{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create spark session"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "http://10.150.236.40:4040\n"
     ]
    }
   ],
   "source": [
    "import sys, glob, os\n",
    "SPARK_HOME = \"/Users/abulbasar/Downloads/spark-2.3.1-bin-hadoop2.7\"\n",
    "sys.path.append(SPARK_HOME + \"/python\")\n",
    "sys.path.append(glob.glob(SPARK_HOME + \"/python/lib/py4j*.zip\")[0])\n",
    "from pyspark.sql import SparkSession, functions as F\n",
    "spark = (SparkSession\n",
    "         .builder\n",
    "         .config(\"spark.master\", \"local[*]\")\n",
    "         .config(\"spark.driver.memory\", \"4G\")\n",
    "         .config(\"spark.sql.shuffle.partitions\", 16)\n",
    "         .getOrCreate())\n",
    "sc = spark.sparkContext\n",
    "print(sc.uiWebUrl)\n",
    "sql = spark.sql"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create utility function to load data into a spark dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import pandas as pd\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%matplotlib inline \n",
    "\n",
    "base_dir = \"/data/movielens/\"\n",
    "\n",
    "def cache_df(df, name):\n",
    "    df.createOrReplaceTempView(name)\n",
    "    spark.catalog.cacheTable(name)\n",
    "    \n",
    "def load(file):\n",
    "    name = re.sub(\"[^A-Za-z0-9]\", \"_\", file[:-4])\n",
    "    df = spark.read.option(\"header\", True).option(\"inferSchema\", True).csv(base_dir + file)\n",
    "    cache_df(df, name)\n",
    "    df.alias(name)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+---------+-----------+\n",
      "|database|tableName|isTemporary|\n",
      "+--------+---------+-----------+\n",
      "|        | df_train|       true|\n",
      "|        |   movies|       true|\n",
      "|        |  ratings|       true|\n",
      "+--------+---------+-----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "movies = load(\"movies.csv\")\n",
    "ratings = load(\"ratings.csv\")\n",
    "\n",
    "sql(\"show tables\").show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Take a look at movies dataset. It shows by default first 20 records."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+--------------------+--------------------+\n",
      "|movieId|               title|              genres|\n",
      "+-------+--------------------+--------------------+\n",
      "|      1|    Toy Story (1995)|Adventure|Animati...|\n",
      "|      2|      Jumanji (1995)|Adventure|Childre...|\n",
      "|      3|Grumpier Old Men ...|      Comedy|Romance|\n",
      "|      4|Waiting to Exhale...|Comedy|Drama|Romance|\n",
      "|      5|Father of the Bri...|              Comedy|\n",
      "|      6|         Heat (1995)|Action|Crime|Thri...|\n",
      "|      7|      Sabrina (1995)|      Comedy|Romance|\n",
      "|      8| Tom and Huck (1995)|  Adventure|Children|\n",
      "|      9| Sudden Death (1995)|              Action|\n",
      "|     10|    GoldenEye (1995)|Action|Adventure|...|\n",
      "|     11|American Presiden...|Comedy|Drama|Romance|\n",
      "|     12|Dracula: Dead and...|       Comedy|Horror|\n",
      "|     13|        Balto (1995)|Adventure|Animati...|\n",
      "|     14|        Nixon (1995)|               Drama|\n",
      "|     15|Cutthroat Island ...|Action|Adventure|...|\n",
      "|     16|       Casino (1995)|         Crime|Drama|\n",
      "|     17|Sense and Sensibi...|       Drama|Romance|\n",
      "|     18|   Four Rooms (1995)|              Comedy|\n",
      "|     19|Ace Ventura: When...|              Comedy|\n",
      "|     20|  Money Train (1995)|Action|Comedy|Cri...|\n",
      "+-------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "movies.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "View the ratings data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+------+----------+\n",
      "|userId|movieId|rating| timestamp|\n",
      "+------+-------+------+----------+\n",
      "|     1|     31|   2.5|1260759144|\n",
      "|     1|   1029|   3.0|1260759179|\n",
      "|     1|   1061|   3.0|1260759182|\n",
      "|     1|   1129|   2.0|1260759185|\n",
      "|     1|   1172|   4.0|1260759205|\n",
      "|     1|   1263|   2.0|1260759151|\n",
      "|     1|   1287|   2.0|1260759187|\n",
      "|     1|   1293|   2.0|1260759148|\n",
      "|     1|   1339|   3.5|1260759125|\n",
      "|     1|   1343|   2.0|1260759131|\n",
      "|     1|   1371|   2.5|1260759135|\n",
      "|     1|   1405|   1.0|1260759203|\n",
      "|     1|   1953|   4.0|1260759191|\n",
      "|     1|   2105|   4.0|1260759139|\n",
      "|     1|   2150|   3.0|1260759194|\n",
      "|     1|   2193|   2.0|1260759198|\n",
      "|     1|   2294|   2.0|1260759108|\n",
      "|     1|   2455|   2.5|1260759113|\n",
      "|     1|   2968|   1.0|1260759200|\n",
      "|     1|   3671|   3.0|1260759117|\n",
      "+------+-------+------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "ratings.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How many rating levels are there? Find count by each rating level"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rating</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.5</td>\n",
       "      <td>1101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>3326</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.5</td>\n",
       "      <td>1687</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2.0</td>\n",
       "      <td>7271</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2.5</td>\n",
       "      <td>4449</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>3.0</td>\n",
       "      <td>20064</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>3.5</td>\n",
       "      <td>10538</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>4.0</td>\n",
       "      <td>28750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>4.5</td>\n",
       "      <td>7723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>5.0</td>\n",
       "      <td>15095</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   rating  count\n",
       "4     0.5   1101\n",
       "1     1.0   3326\n",
       "2     1.5   1687\n",
       "9     2.0   7271\n",
       "0     2.5   4449\n",
       "8     3.0  20064\n",
       "6     3.5  10538\n",
       "5     4.0  28750\n",
       "7     4.5   7723\n",
       "3     5.0  15095"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings.groupBy(\"rating\").count().toPandas().sort_values(\"rating\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find top 10 movies based on the highest avg rating. Each top movies must have at least 100 ratings to make the avg to reliable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------------------+--------------------+\n",
      "|movieId|        avg_rating|               title|\n",
      "+-------+------------------+--------------------+\n",
      "|    318| 4.405144694533762|Shawshank Redempt...|\n",
      "|    858|             4.395|Godfather, The (1...|\n",
      "|   1221| 4.303703703703704|Godfather: Part I...|\n",
      "|     50|4.2835820895522385|Usual Suspects, T...|\n",
      "|    527| 4.209016393442623|Schindler's List ...|\n",
      "|    608| 4.191964285714286|        Fargo (1996)|\n",
      "|    912| 4.162393162393163|   Casablanca (1942)|\n",
      "|    296| 4.157407407407407| Pulp Fiction (1994)|\n",
      "|   1193| 4.138888888888889|One Flew Over the...|\n",
      "|   1196| 4.136752136752137|Star Wars: Episod...|\n",
      "+-------+------------------+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "top10 = (ratings\n",
    " .groupBy(\"movieId\")\n",
    " .agg(F.avg(\"rating\").alias(\"avg_rating\"), F.count(\"*\").alias(\"rating_count\"))\n",
    " .filter(\"rating_count > 100\")\n",
    " .join(movies, on = \"movieId\")\n",
    " .select(\"movieId\", \"avg_rating\", \"title\")\n",
    " .orderBy(F.desc(\"avg_rating\"))\n",
    " .limit(10)\n",
    ")\n",
    "top10.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Does the rating pattern changes from user to user? For example, one user tends to give higher rating than other? Let's take a look at top 100 most rated movies, and group rating by users to find their median rating"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x61949a2b0>"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAD8CAYAAABthzNFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEvJJREFUeJzt3X+w5XV93/HnSxZFTMxq9qqUhVxsd0yMo5VsiCltSkAbFAO20YSMMavBbNOQqE1mAthOsJ06g5PEX0lrshHrYoyA+IONYCKixOYPwMuPCrJYdnQDG4h74y80WOjiu3+c78Xrnc/uPffuPed8z97nY+bOOd8f53xf82U5r/P9eVJVSJK01OMmHUCS1E8WhCSpyYKQJDVZEJKkJgtCktRkQUiSmiwISVKTBSFJarIgJElNGyYd4HBs2rSpZmdnJx1DkqbKLbfc8g9VNbPcfFNdELOzs8zNzU06hiRNlSR/O8x87mKSJDVZEJKkJgtCktRkQUiSmiwISVKTBSFJarIgJElNFoQkqcmCkCQ1TfWV1JJ0pJu98Jrm+L2XnDXyZbsFIUlqsiAkSU0WhCSpyYKQJDVZEJKkJgtCktRkQUiSmiwISVKTBSFJarIgJElNFoQkqcmCkCQ1jawgkrwnyf4kdy4a93tJ7k7yuSQfSbJx0bSLkuxJ8oUkPzOqXJKk4YxyC+K9wJlLxl0HPKeqngv8H+AigCTPBs4FfrR7zf9IctQIs0mSljGygqiqzwBfXTLuE1V1oBu8EdjcPT8HuLyqHq6qLwF7gFNGlU2StLxJHoP4FeDj3fPjgfsWTdvXjZMkTchECiLJfwIOAO9fGNWYrQ7y2u1J5pLMzc/PjyqiJK17Yy+IJNuAlwKvrKqFEtgHnLBots3A/a3XV9WOqtpaVVtnZmZGG1aS1rGxFkSSM4ELgLOr6qFFk3YB5yZ5QpKTgC3AzePMJkn6XiP7TeokHwBOAzYl2QdczOCspScA1yUBuLGqfq2qPp/kSuAuBruezq+qR0eVTZK0vJEVRFX9YmP0pYeY/83Am0eVR5K0Ml5JLUlqsiAkSU0WhCSpyYKQJDVZEJKkJgtCktRkQUiSmiwISVKTBSFJahrZldSS1pfZC6856LS9l5w1xiRaK25BSJKaLAhJUpMFIUlqsiAkSU0WhCSpyYKQJDVZEJKkJgtCktRkQUiSmiwISVKTBSFJarIgJElNFoQkqWlkBZHkPUn2J7lz0binJrkuyT3d41O68UnyziR7knwuycmjyiVJGs4otyDeC5y5ZNyFwPVVtQW4vhsGeDGwpfvbDrxrhLkkSUMYWUFU1WeAry4ZfQ6ws3u+E3jZovGX1cCNwMYkx40qmyRpeeM+BvH0qnoAoHt8Wjf+eOC+RfPt68ZJkiakLwep0xhXzRmT7UnmkszNz8+POJYkrV/jLogvL+w66h73d+P3AScsmm8zcH/rDapqR1VtraqtMzMzIw0rSevZuAtiF7Cte74NuHrR+F/uzmZ6AfCNhV1RkqTJ2DCqN07yAeA0YFOSfcDFwCXAlUnOA+4FXtHNfi3wEmAP8BDwmlHlkiQNZ2QFUVW/eJBJZzTmLeD8UWWRJK1cXw5SS5J6xoKQJDVZEJKkJgtCktRkQUiSmiwISVKTBSFJarIgJElNFoQkqcmCkCQ1WRCSpCYLQpLUZEFIkposCElSkwUhSWqyICRJTRaEJKnJgpAkNVkQkqQmC0KS1DRUQSR5zqiDSJL6ZdgtiD9OcnOSX0+ycaSJJEm9MFRBVNW/BF4JnADMJfnzJC8aaTJJ0kQNfQyiqu4B/jNwAfCvgXcmuTvJv1vpQpP8xySfT3Jnkg8kOSbJSUluSnJPkiuSPH6l7ytJWjvDHoN4bpK3AbuB04Gfraof6Z6/bSULTHI88Dpga1U9BzgKOBd4C/C2qtoCfA04byXvK0laW8NuQfwRcCvwvKo6v6puBaiq
+xlsVazUBuCJSTYAxwIPMCibq7rpO4GXreJ9JUlrZMOQ870E+HZVPQqQ5HHAMVX1UFW9byULrKq/S/L7wL3At4FPALcAX6+qA91s+4DjW69Psh3YDnDiiSeuZNGSpBUYdgvik8ATFw0f241bsSRPAc4BTgL+CfAk4MWNWav1+qraUVVbq2rrzMzMaiJIkoYwbEEcU1XfWhjonh+7ymW+EPhSVc1X1f8DPgz8C2Bjt8sJYDNw/yrfX5K0BoYtiH9McvLCQJIfY7B7aDXuBV6Q5NgkAc4A7gI+Dby8m2cbcPUq31+StAaGPQbxBuCDSRa+1R8H/MJqFlhVNyW5isFB7wPAbcAO4Brg8iT/rRt36WreX5K0NoYqiKr6bJIfBp4FBLi72z20KlV1MXDxktFfBE5Z7XtKktbWsFsQAD8OzHaveX4SquqykaSSJE3cUAWR5H3APwVuBx7tRhdgQUjSEWrYLYitwLOrqnnqqSTpyDPsWUx3As8YZRBJUr8MuwWxCbgryc3Awwsjq+rskaSSJE3csAXxplGGkCT1z7Cnuf51kh8CtlTVJ5Mcy+AurJKkI9Swt/v+VQZ3Wv2TbtTxwEdHFUqSNHnDHqQ+HzgVeBAe+/Ggp40qlCRp8oYtiIer6pGFge6mep7yKklHsGEL4q+TvJHBj/y8CPgg8BejiyVJmrRhC+JCYB64A/j3wLWs7pfkJElTYtizmL4D/Gn3J0laB4a9F9OXaBxzqKpnrnkiSVIvrOReTAuOAV4BPHXt40iS+mKoYxBV9ZVFf39XVW8HTh9xNknSBA27i+nkRYOPY7BF8f0jSSRJ6oVhdzH9waLnB4C9wM+veRpJUm8MexbTT486iCSpX4bdxfRbh5peVW9dmziSpL5YyVlMPw7s6oZ/FvgMcN8oQkmSJm8lPxh0clV9EyDJm4APVtVrRxVMkjRZw95q40TgkUXDjwCzq11oko1Jrkpyd5LdSX4yyVOTXJfknu7xKat9f0nS4Ru2IN4H3JzkTUkuBm4CLjuM5b4D+Muq+mHgecBuBvd7ur6qtgDXd8OSpAkZ9iymNyf5OPCvulGvqarbVrPAJE8Gfgp4dffejwCPJDkHOK2bbSdwA3DBapYhSTp8w25BABwLPFhV7wD2JTlplct8JoM7w/7PJLcleXeSJwFPr6oHALpHf5BIkiZo2J8cvZjBt/mLulFHA3+2ymVuAE4G3lVVzwf+kRXsTkqyPclckrn5+flVRpAkLWfYLYh/C5zN4MOcqrqf1d9qYx+wr6pu6oavYlAYX05yHED3uL/14qraUVVbq2rrzMzMKiNIkpYzbEE8UlVFd8vvbpfQqlTV3wP3JXlWN+oM4C4G11hs68ZtA65e7TIkSYdv2OsgrkzyJ8DGJL8K/AqH9+NBvwm8P8njgS8Cr2FQVlcmOQ+4l8EtxSVJEzLsWUy/3/0W9YPAs4DfrarrVrvQqrqd7/2NiQVnrPY9JUlra9mCSHIU8FdV9UJg1aUgSZouyx6DqKpHgYeS/MAY8kiSemLYYxD/F7gjyXV0ZzIBVNXrRpJKkjRxwxbENd2fJGmdOGRBJDmxqu6tqp3jCiRJ6ofljkF8dOFJkg+NOIskqUeWK4gsev7MUQaRJPXLcgVRB3kuSTrCLXeQ+nlJHmSwJfHE7jndcFXVk0eaTpJGYPbC9jk3ey85a8xJ+u2QBVFVR40riCSpX1byexCSpHXEgpAkNVkQkqQmC0KS1GRBSJKaLAhJUpMFIUlqsiAkSU0WhCSpyYKQJDVZEJKkJgtCktRkQUiSmiZWEEmOSnJbko91wycluSnJPUmuSPL4SWWTJE12C+L1wO5Fw28B3lZVW4CvAedNJJUkCZhQQSTZDJwFvLsbDnA6cFU3y07gZZPIJkkamNQWxNuB3wG+0w3/IPD1qjrQDe8Djp9EMEnSwNgLIslLgf1Vdcvi0Y1Zm7+BnWR7krkkc/Pz8yPJKEmazBbEqcDZSfYClzPYtfR2YGOShZ9A
3Qzc33pxVe2oqq1VtXVmZmYceSVpXRp7QVTVRVW1uapmgXOBT1XVK4FPAy/vZtsGXD3ubJKk7+rTdRAXAL+VZA+DYxKXTjiPJK1rG5afZXSq6gbghu75F4FTJplHkvRdfdqCkCT1iAUhSWqyICRJTRaEJKnJgpAkNVkQkqQmC0KS1GRBSJKaLAhJUpMFIUlqsiAkSU0WhCSpyYKQJDVZEJKkJgtCktRkQUiSmiwISVKTBSFJarIgJElNFoQkqWnDpANIWluzF17THL/3krPGnETTzoLQuuSH6JHF/56j4S4mSVLT2LcgkpwAXAY8A/gOsKOq3pHkqcAVwCywF/j5qvrauPNJ6q+DbSloNCaxBXEA+O2q+hHgBcD5SZ4NXAhcX1VbgOu7YUnShIx9C6KqHgAe6J5/M8lu4HjgHOC0bradwA3ABePOJ0nDOtKPfUz0GESSWeD5wE3A07vyWCiRpx3kNduTzCWZm5+fH1dUSVp3JlYQSb4P+BDwhqp6cNjXVdWOqtpaVVtnZmZGF1CS1rmJFESSoxmUw/ur6sPd6C8nOa6bfhywfxLZJEkDkziLKcClwO6qeuuiSbuAbcAl3ePV484m9dGRvp9b/TWJC+VOBV4F3JHk9m7cGxkUw5VJzgPuBV4xgWySpM4kzmL6GyAHmXzGOLNIkg7OK6klSU3ei0lS76zVFdNeeX14LAhJWsZ6LRp3MUmSmiwISVKTBSFJarIgJElNHqSWpDV2qIPa03QFvAUhrXPeykMH4y4mSVKTBSFJarIgJElNHoOQpDGapmM+bkFIkposCElSk7uYpsA0bZJKK7Feb4I3LdyCkCQ1WRCSpCZ3MUkj4q7B6eMur+9lQWii/BCV+stdTJKkJrcgpDFzq0ktfdy91bstiCRnJvlCkj1JLpx0Hklar3q1BZHkKOC/Ay8C9gGfTbKrqu6abLIj21p+o12r95qmb9lr9c2vb98gp+m/gUajVwUBnALsqaovAiS5HDgHWPOCWMsf9PB/pOX17cNvpaY9Pxy5RabR6dsupuOB+xYN7+vGSZLGLFU16QyPSfIK4Geq6rXd8KuAU6rqNxfNsx3Y3g0+C/jCKhe3CfiHw4g7btOUd5qywnTlnaasMF15pykrHF7eH6qqmeVm6tsupn3ACYuGNwP3L56hqnYAOw53QUnmqmrr4b7PuExT3mnKCtOVd5qywnTlnaasMJ68fdvF9FlgS5KTkjweOBfYNeFMkrQu9WoLoqoOJPkN4K+Ao4D3VNXnJxxLktalXhUEQFVdC1w7hkUd9m6qMZumvNOUFaYr7zRlhenKO01ZYQx5e3WQWpLUH307BiFJ6okjviCSvCfJ/iR3HmT6aUm+keT27u93x51xUZYTknw6ye4kn0/y+sY8SfLO7lYkn0tyco+z9mndHpPk5iT/u8v7XxrzPCHJFd26vSnJ7PiTDp311UnmF63b104i66I8RyW5LcnHGtN6sV6XZDpU3r6t271J7uiyzDWmj+wzoXfHIEbgvcAfAZcdYp7/VVUvHU+cQzoA/HZV3Zrk+4Fbkly35FYjLwa2dH8/Abyrexy3YbJCf9btw8DpVfWtJEcDf5Pk41V146J5zgO+VlX/LMm5wFuAX+hpVoArquo3JpCv5fXAbuDJjWl9Wa+LHSov9GvdAvx0VR3smoeRfSYc8VsQVfUZ4KuTzjGMqnqgqm7tnn+TwT/gpVeSnwNcVgM3AhuTHDfmqMNm7Y1ufX2rGzy6+1t6AO4cYGf3/CrgjCQZU8THDJm1N5JsBs4C3n2QWXqxXhcMkXfajOwz4YgviCH9ZLc5//EkPzrpMADdZvjzgZuWTOrd7UgOkRV6tG673Qq3A/uB66rqoOu2qg4A3wB+cLwpB4bICvBz3S6Fq5Kc0Jg+Lm8Hfgf4zkGm92a9dpbLC/1ZtzD4cvCJJLdkcCeJpUb2mWBBwK0MLjt/HvCHwEcnnIck3wd8CHhDVT24dHLjJRP7drlM1l6t26p6tKr+
OYMr9E9J8pwls/Rm3Q6R9S+A2ap6LvBJvvsNfaySvBTYX1W3HGq2xriJrNch8/Zi3S5yalWdzGBX0vlJfmrJ9JGt33VfEFX14MLmfHcNxtFJNk0qT7fP+UPA+6vqw41Zlr0dybgsl7Vv63ZBVX0duAE4c8mkx9Ztkg3ADzDh3ZMHy1pVX6mqh7vBPwV+bMzRFpwKnJ1kL3A5cHqSP1syT5/W67J5e7RuF/Lc3z3uBz7C4K7Xi43sM2HdF0SSZyzsD01yCoN18pUJZQlwKbC7qt56kNl2Ab/cnbnwAuAbVfXA2EJ2hsnas3U7k2Rj9/yJwAuBu5fMtgvY1j1/OfCpmsCFQsNkXbKP+WwGx4DGrqouqqrNVTXL4NY4n6qqX1oyWy/WKwyXty/rtsvypO4kEJI8Cfg3wNIzMkf2mXDEn8WU5APAacCmJPuAixkc9KOq/pjBP9j/kOQA8G3g3En942Xw7eZVwB3d/meANwInwmN5rwVeAuwBHgJeM4GcMFzWPq3b44CdGfwo1eOAK6vqY0n+KzBXVbsYFN77kuxh8A333B5nfV2SsxmcTfZV4NUTytrU0/V6UD1et08HPtJ9z9oA/HlV/WWSX4PRfyZ4JbUkqWnd72KSJLVZEJKkJgtCktRkQUiSmiwISVKTBSFJarIgJElNFoQkqen/A2F48V1noMJ3AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x6194192b0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(ratings\n",
    " .join(top10, on = \"movieId\")\n",
    " .groupBy(\"userId\")\n",
    " .agg(F.avg(\"rating\").alias(\"avg_rating\"))\n",
    ").toPandas()[\"avg_rating\"].plot.hist(bins = 50)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We notice event for the that can be considered good movies in generated has a wide range of perception among the viewers."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Change the fraction rating values to intger to reduce the number of rating."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rating</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>1101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>5013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>11720</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>30602</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>36473</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5</td>\n",
       "      <td>15095</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   rating  count\n",
       "5       0   1101\n",
       "2       1   5013\n",
       "3       2  11720\n",
       "1       3  30602\n",
       "4       4  36473\n",
       "0       5  15095"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings = ratings.withColumn(\"rating\", F.expr(\"cast(rating as int)\"))\n",
    "ratings.groupBy(\"rating\").count().toPandas().sort_values(\"rating\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find the distribution of rating by plotting histogram."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x61869e518>"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAD8CAYAAAB6paOMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEWJJREFUeJzt3XusJnV9x/H3B0TBS0VkwQ2wLrbESo0XPBIaWmu9xYoCtmgxVleDbi82amiiaIzapk0wadVaTXW91BVFQBRBEBUQNCYtuitUUWihulW6xMULgkrFxW//eGbt8Xguc5Yz85zd3/uVnDwz88xz5stv9+Gzv/nN/CZVhSSpXftMuwBJ0nQZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTG3WvaBfRx8MEH1/r166ddhiTtUbZu3frdqlqz1H57RBCsX7+eLVu2TLsMSdqjJPnvPvt5akiSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhq3R9xZLKm/9WdcMu/2bWeesCL7a+9jj0CSGmcQSFLjDAJJatygYwRJtgF3AHcDO6tqJslBwLnAemAb8Nyq+sGQdUiSFjZGj+D3q+oxVTXTrZ8BXFFVRwFXdOuSpCmZxqmhk4DN3fJm4OQp1CBJ6gwdBAV8JsnWJBu7bYdW1S0A3eshA9cgSVrE0PcRHF9V25McAlyW5Ia+H+yCYyPAunXrhqpPkpo3aI+gqrZ3rzuAC4Bjge8kWQvQve5Y4LObqmqmqmbWrFnykZuSpN00WBAkuV+SB+xaBp4GXAdcBGzodtsAXDhUDZKkpQ15auhQ4IIku45zdlV9KsmXgPOSnAZ8C3jOgDVIkpYwWBBU1TeAR8+z/XvAk4c6riRpebyzWJIaZxBIUuMMAklqnM8jkBqx0HMHJHsEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXuXtMuQNLqtP6MS1bk92w784QV+T0ajj0CSWqcQSBJjTMIJKlxgwdBkn2TXJPk4m79yCRXJ7kxyblJ7j10DZKkhY3RI3gFcP2s9TcBb6mqo4AfAKeNUIMkaQGDBkGSw4ETgPd06wGeBJzf7bIZOHnIGiRJixu6R/BW4FXAz7v1BwO3VdXObv1m4LCBa5AkLWKw+wiSPBPYUVVbkzxx1+Z5dq0FPr8R2Aiwbt26QWqUNLyF7kfw/oLVY8gewfHAiUm2AecwOSX0VuDAJLsC6HBg+3wfrqpNVTVTVTNr1qwZsExJattgQVBVr6mqw6tqPXAq8Nmqej5wJXBKt9sG4MKhapAkLW0a9xG8Gjg9yU1MxgzeO4UaJEmdUeYaqqqrgKu65W8Ax45xXEnS0ryzWJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1bpRpqCVpLh9huXrYI5CkxhkEktQ4g0CSGucYgbQKLHS+fDGeS9dKsUcgSY0zCCSpcb2CIMkjhy5EkjQdfccI3pnk3sD7gbOr6rbhSpLUx+6MK0jz6dUjqKrfAZ4PHAFsSXJ2kqcOWpkkaRS9xwiq6kbgdcCrgd8D3pbkhiR/OFRxkqTh9R0jeFSStwDXA08CnlVVj+iW3zJgfZKkgfUdI3g78G7gtVV1566NVbU9yesGqUySNIq+QfAM4M6quhsgyT7A/lX1k6o6a7DqJEmD6ztGcDlwwKz1+3bbJEl7uL5BsH9V/WjXSrd832FKkiSNqW8Q/DjJMbtWkjwOuHOR/SVJe4i+YwSvBD6SZHu3vhb448U+kGR/4PPAfbrjnF9Vb0hyJHAOcBDw
ZeAFVXXX7hQvSbrnegVBVX0pyW8CDwcC3FBVP1viYz8FnlRVP0qyH/CFJJcCpwNvqapzkrwTOA34593/T5Ak3RPLmXTu8cCjgMcCz0vywsV2rold4wr7dT/F5N6D87vtm4GTl1WxJGlF9eoRJDkL+HXgWuDubnMBH1jic/sCW4HfAN4B/BdwW1Xt7Ha5GThsgc9uBDYCrFu3rk+ZkqTd0HeMYAY4uqpqOb+8u+/gMUkOBC4AHjHfbgt8dhOwCWBmZmZZx5Uk9df31NB1wEN29yDdbKVXAccBBybZFUCHA9sX+pwkaXh9ewQHA19P8kUmg8AAVNWJC30gyRrgZ1V1W5IDgKcAbwKuBE5hcuXQBuDC3axdkrQC+gbBG3fjd68FNnfjBPsA51XVxUm+DpyT5G+Ba4D37sbvliStkL6Xj34uyUOBo6rq8iT3BfZd4jNfYXKF0dzt3wCO3Z1iJUkrr+801C9lcsnnu7pNhwEfH6ooSdJ4+g4Wvww4HrgdfvGQmkOGKkqSNJ6+QfDT2dNAdFf9eEmnJO0F+gbB55K8Fjige1bxR4BPDFeWJGksfYPgDOBW4KvAnwKfZPL8YknSHq7vVUM/Z/KoyncPW44kaWx95xr6JvOMCVTVw1a8IknSqJYz19Au+wPPYfI8AUnSHq7XGEFVfW/Wz/9U1VuZTCctSdrD9T01dMys1X2Y9BAeMEhFkqRR9T019A+zlncC24Dnrng1kqTR9b1q6PeHLkSSNB19Tw2dvtj7VfXmlSlHkjS25Vw19Hjgom79WcDngW8PUZQkaTzLeTDNMVV1B0CSNwIfqaqXDFWYpDatP+OSBd/bduYJI1bSjr5TTKwD7pq1fhewfsWrkSSNrm+P4Czgi0kuYHKH8bOBDwxWlSRpNH2vGvq7JJcCv9ttenFVXTNcWZKksfTtEQDcF7i9qv4lyZokR1bVN4cqTJL6WmhcwTGFfvo+qvINwKuB13Sb9gM+OFRRkqTx9B0sfjZwIvBjgKrajlNMSNJeoW8Q3FVVRTcVdZL7DVeSJGlMfYPgvCTvAg5M8lLgcnxIjSTtFfpeNfT33bOKbwceDry+qi4btDJJ0iiWDIIk+wKfrqqnAP7PX5L2MkueGqqqu4GfJHngCPVIkkbW9z6C/wW+muQyuiuHAKrq5YNUJUnzWGweIu2+vkFwSfcjSdrLLBoESdZV1beqavNYBUmSxrXUGMHHdy0k+ejAtUiSpmCpIMis5Yct5xcnOSLJlUmuT/K1JK/oth+U5LIkN3avD1pu0ZKklbNUENQCy33sBP6qqh4BHAe8LMnRwBnAFVV1FHBFty5JmpKlBosfneR2Jj2DA7pluvWqql9b6INVdQtwS7d8R5LrgcOAk4AndrttBq5iMqGdJGkKFg2Cqtp3JQ6SZD3wWOBq4NAuJKiqW5IcshLHkCTtnr5zDe22JPcHPgq8sqpuX2r/WZ/bmGRLki233nrrcAVKUuMGDYIk+zEJgQ9V1ce6zd9JsrZ7fy2wY77PVtWmqpqpqpk1a9YMWaYkNW2wIEgS4L3A9VX15llvXQRs6JY3ABcOVYMkaWnLeVTlch0PvIDJ1BTXdtteC5zJZFrr04BvAc8ZsAZJ0hIGC4Kq+gK/fB/CbE8e6riSpOUZfLBYkrS6GQSS1DiDQJIaN+RgsaQ5nE9fq5E9AklqnEEgSY0zCCSpcY4RSANwLEB7EnsEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1zvsIpB4Wui9g25knjFyJtPLsEUhS4wwCSWqcQSBJjXOMQLoHnFNIewN7BJLUOINAkhpnEEhS4xwjUJO8L6AN/jn3Y49AkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGDRYESd6XZEeS62ZtOyjJZUlu7F4fNNTxJUn9DNkjeD/w9DnbzgCuqKqjgCu6dUnSFA0WBFX1eeD7czafBGzuljcDJw91fElSP2OPERxaVbcAdK+HjHx8SdIc
q3awOMnGJFuSbLn11lunXY4k7bXGDoLvJFkL0L3uWGjHqtpUVTNVNbNmzZrRCpSk1owdBBcBG7rlDcCFIx9fkjTHkJePfhj4V+DhSW5OchpwJvDUJDcCT+3WJUlTNNjzCKrqeQu89eShjilJWr5VO1gsSRqHQSBJjTMIJKlxPrNYe7WFnlm7UvtLewN7BJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuOca0h7BecI0nIs9Pdl25knjFzJ6mCPQJIaZxBIUuMMAklqnGMEmqrlnqt1LEBaefYIJKlxBoEkNc4gkKTGOUagVcmxAE1Dq/cX2COQpMYZBJLUOINAkhrnGIGWrdXzqGrXSo1ZrdbviD0CSWqcQSBJjTMIJKlxqarxD5o8HfhHYF/gPVV15mL7z8zM1JYtW3brWMs9tzfNc3ir7dy71/JL03VPv/tJtlbVzFL7jd4jSLIv8A7gD4CjgeclOXrsOiRJE9M4NXQscFNVfaOq7gLOAU6aQh2SJKYTBIcB3561fnO3TZI0BdO4jyDzbPuVgYokG4GN3eqPkvzHALUcDHz3l477pgGOsny/VNcqqQnmaa9VwrqWZzXWtRprginXtch3v29dD+1znGkEwc3AEbPWDwe2z92pqjYBm4YsJMmWPgMpY7Ou5bGu5VmNda3GmqCduqZxauhLwFFJjkxyb+BU4KIp1CFJYgo9gqrameQvgU8zuXz0fVX1tbHrkCRNTGWuoar6JPDJaRx7jkFPPd0D1rU81rU8q7Gu1VgTNFLXVG4okyStHk4xIUmN2+uDIMn7kuxIct0C7yfJ25LclOQrSY5ZJXU9MckPk1zb/bx+pLqOSHJlkuuTfC3JK+bZZ/Q261nX6G2WZP8kX0zy711dfz3PPvdJcm7XXlcnWb8KanpRkltntdVLhqxpzrH3TXJNkovneW/UtlpGXVNpryTbkny1O+avzLOzYt/Fqtqrf4AnAMcA1y3w/jOAS5nc33AccPUqqeuJwMVTaK+1wDHd8gOA/wSOnnab9axr9Dbr2uD+3fJ+wNXAcXP2+Qvgnd3yqcC5q6CmFwFvH/vvV3fs04Gz5/uzGrutllHXVNoL2AYcvMj7K/Jd3Ot7BFX1eeD7i+xyEvCBmvg34MAka1dBXVNRVbdU1Ze75TuA6/nVO79Hb7OedY2ua4Mfdav7dT9zB95OAjZ3y+cDT04y342VY9Y0FUkOB04A3rPALqO21TLqWq1W5Lu41wdBD6t5yovf7rr3lyb5rbEP3nXLH8vkX5SzTbXNFqkLptBm3SmFa4EdwGVVtWB7VdVO4IfAg6dcE8AfdacTzk9yxDzvD+GtwKuAny/w/uht1bMumE57FfCZJFszmW1hrhX5LhoEPae8mIIvAw+tqkcD/wR8fMyDJ7k/8FHglVV1+9y35/nIKG22RF1TabOquruqHsPkLvljkzxyzi6jt1ePmj4BrK+qRwGX8///Ch9MkmcCO6pq62K7zbNt0LbqWdfo7dU5vqqOYTJb88uSPGHO+yvSXgZBzykvxlZVt+/q3tfkvov9khw8xrGT7Mfkf7YfqqqPzbPLVNpsqbqm2WbdMW8DrgKePuetX7RXknsBD2Sk04IL1VRV36uqn3ar7wYeN0I5xwMnJtnGZNbhJyX54Jx9ptFWS9Y1pfaiqrZ3rzuAC5jM3jzbinwXDYLJ9BYv7EbfjwN+WFW3TLuoJA/ZdW40ybFM/qy+N8JxA7wXuL6q3rzAbqO3WZ+6ptFmSdYkObBbPgB4CnDDnN0uAjZ0y6cAn61upG9aNc05j3wikzGXQVXVa6rq8Kpaz2Qg+LNV9Sdzdhu1rfrWNY32SnK/JA/YtQw8DZh7leGKfBencmfxmJJ8mMnVJAcnuRl4A5PBM6rqnUzucH4GcBPwE+DFq6SuU4A/T7ITuBM4degvROd44AXAV7tzzACvBdbNqm0abdanrmm02VpgcyYPXNoHOK+q
Lk7yN8CWqrqISYCdleQmJv+6PXUV1PTyJCcCO7uaXjRwTQuaclv1rWsa7XUocEH3b5t7AWdX1aeS/Bms7HfRO4slqXGeGpKkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ17v8A7BMfX7GhVRUAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x6186a4a20>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "ratings.groupBy(\"userId\").agg(F.avg(\"rating\").alias(\"avg_rating\")).toPandas().avg_rating.plot.hist(bins = 50)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "How the average rating have changed over years."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0,0.5,'Avg Rating')"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEKCAYAAAD9xUlFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3Xl4XFd5+PHvO6N910iyFku2JVteZHlJ4lh2iA1ZSJwdCrS0LGEL8LRsbWnZSkJC2XmgUCgQwtqSQkt/QGJwQggOjkO8JrZseY/lRZslW9bIsnbp/P6YeyeKrGVmNHcWzft5nnk8mrm6OnMtzTvnvOc9R4wxKKWUUgCuaDdAKaVU7NCgoJRSyk+DglJKKT8NCkoppfw0KCillPLToKCUUspPg4JSSik/DQpKKaX8NCgopZTyS4p2A4JVWFhoFixYEO1mKKVUXNm7d+95Y0zRdMfFXVBYsGABe/bsiXYzlFIqrojI6UCO0+EjpZRSfhoUlFJK+WlQUEop5adBQSmllJ8GBaWUUn4aFJRSSvk5NiVVRNKAbUCq9XN+aYx5YNwxXwdusL7MAOYYY/KcapNSSqmpOVmnMADcaIzpEZFkYLuIbDHG7LAPMMb8vX1fRD4IXOVgexJCx6UB/vzSee5eVYaIRLs5Sqk449jwkfHpsb5Mtm5TbQj918B/O9WeRPGVJ4/w4Z/v44HHGtD9t5VSwXI0pyAibhHZB7QDTxljdk5y3HygEvjjJM+/V0T2iMiejo4O5xoc5waHR3niYBuFWSn89PnTfEYDg1IqSI4GBWPMiDFmNVAOrBWR2kkOfTO+nMPIJOd52Bizxhizpqho2qU7Etb2Ex109w/z5Teu5L4Nlfzk+dM8+PghDQxKqYBFZO0jY0yXiDwDbAIOTnDIm4G/i0RbZrPN9a3kpCVx/aIiblgyB2Pgke2NADxwV43mGJRS03Jy9lERMGQFhHTgZuBLExy3BMgHnneqLYmgf2iEpxrOsam2hJQkXwfwU3cswwA/2N6ICNx/pwYGpdTUnOwplAI/ERE3vmGq/zHGbBaRh4A9xpjHrOP+Gvi50TGOGdl2rINLA8PcuarM/5iI8C93LMMY+OFzjQjCp+9cpoFBKTUpx4KCMaaeCaaYGmPuH/f1Z5xqQyLZXN9KfkYy1y0seMXjIr5AYDC+wCDwL3doYFBKTSzu9lNQV+obHOEPh89xz+q5JLuvnDsgItx/Zw1gDSXhG1rSwKCUGk+DwizwzNF2egdHuGtl6aTH2IFhbPJZA4NSajwNCrPA5vpWCrNSWFvpmfI4EeGBu3w9hkes5PMnb9fAoJR6mQaFOHd5YJinj5zjTddUkDTB0NF4dmAwxvD9ZxsRET5x21INDEopIIGCQktXH7870Mp7NlRFuylh9fSRdvqHRrlziqGj8USEz9y9HAM8vO0kAnxcA4NSigQKCr96sZmvPHmURXOyeM2SOdFuTths3t9CcU4q1y6YeuhoPBHhwbuXYwx8b9tJEPj4Jg0MSiW6hNlP4T0bKqkqyuT+3zTQPzThahpx51L/EM8c6+D2FaW4XMG/mYsID92znLeum8f3/nSSLz1xVJfEUCrBJUxQSE1y87nXreBMZy/f+uOJaDcnLP5w+ByDw8ENHY0nIjx0dy1vqZvHd//0Er/YfTaMLVRKxZuECQoA6xcW8BdXz+V7217iRPulaDdnxjbvb6UsN42rKvJndB6XS/jsPbVUz8ni8fqWMLVOKRWPEiooAHzq9mVkpCTxyV8djOuhEm/vENuOd3DHytCGjsZzuYRXLy5id+NF+gZnx/CaUip4CRcUCrJS+cRtS9nV2Mkv9zZFuzkhe/JQG0MjhjtXlk1/cIA2LC5icGSUnY0XwnZOpVR8SbigAPCXaypYMz+fz//uMBcvD0a7OSH5bX0rFZ50Vpbnhu2cdZUeUpJcbDt2PmznVErFl4QMCi6X8LnX
r+BS/zBf2HI42s0J2sXLgzx34jx3rAjvPsxpyW7qKj08e1x3t1MqUSVkUABYUpLNezZU8T97mtjV2Bnt5gTliYY2hkfNjGYdTWZjdRHH23to6eoL+7mVUrEvYYMCwIduWsTcvHT+5dcHGBwejXZzAra5voXKwkyWl+WE/dwbFhcCsP24DiEplYgSOihkpCTx0D3LOXauh0e2n4x2cwLScWmA51+6wJ0rSx2pPl5SnM2c7FS26RCSUgkpoYMCwE3Litm0vIRvPn2cs5290W7OtJ5oaGPUENZZR2OJCBuqi9h+4jwjo/E7ZVcpFZqEDwoAD9xdg1uE+38T/tqFcJ9v8/4WFs3JYnFxVljPO9bGxYV09Q5xsNnr2M9QSsUmDQpAaW46/3DLErYe7eCJg21hOae3d4gP//xF1n/hjzSevxyWc57r7mfXqU7Hho5s1y/y5RW2HdMhJKUSjQYFy73r51NTmsNnHm/gUv/QjM717PEObv23bfy2vpXLA8O8+8e78fbO7JwAvzvQinFw6MhWkJVK7dwcntVks1IJR4OCJcnt4vN/sYL2SwN87aljIZ2jb3CE+39zkLf9YBdZaUn86m9fxQ/ecS1nL/byt4/uZWhkZjOcNte3srQkm0VznBs6sm2sLuKFMxdnHCCVUvFFg8IYqyvyeGvdfH7y51NBj6fvO9vFHd98lp8+f5p3X1/J5g9ez4ryXNZWevj861fw3IkLfOaxhpBzDC1dfew9fZG7VjnbS7BtqC5ieNTw/Eu65IVSiUSDwjj/tGkJBVmpfPJXBwKafTM0MsrXnjrGG77zZ/qHRnj0PXV8+s4a0pLd/mPetKaC9726ip/tPMOP/3wqpHb97kArAHesCH/B2kSumZ9PRopbh5CUSjAaFMbJSUvm03fWUN/k5Wc7T0957In2Ht7wnT/zzaePc8/qMrZ8ZCPXWUna8T5261JeW1PMZzcfYuvR9qDb9Xh9K7Vzc1hQmBn094YiJcnF+qoCrVdQKsE4FhREJE1EdonIfhFpEJEHJznuL0XkkHXMo061Jxh3rSxlQ3UhX3niKOe6+694fnTU8KPnGrnjm89ytrOX77zlar72l6vJTU+e9Jwul/Bvf7WaJSU5fPDRFzl2LvD9HM529rL/bJfjCebxNlQXcvpCL6cvhGf2lFIq9jnZUxgAbjTGrAJWA5tEZN3YA0SkGvgE8CpjzHLgIw62J2Aivk1nBkZG+ezmQ694rqWrj7f9cCcPPn6I6xYW8OTfb+S2AId0MlOT+MG9a0hPcfPun+zmQs9AQN+3uT6yQ0e2jYuLANimQ0hKJQzHgoLx6bG+TLZu4wfp7wO+bYy5aH1P8OMqDllQmMkHbljE5vpW/nSsA2MMv36xmVv/bRsvnuniC3+xgh++41rmZKcFdd6yvHS+//Y1tHcP8P7/2svA8PQb2vz2QAurKvKo8GSE+nJCUlmYydy8dJ7VegWlEoajOQURcYvIPqAdeMoYs3PcIYuBxSLynIjsEJFNk5znvSKyR0T2dHRE7g3qfa+uoqook0//+iAfePRFPvKLfSwuzmbLhzfw12vnhVxAtroij6++aRW7T13kk/9v6irqU+cvc7C5m7scWBF1OiLCxsVF/PmlCzOeTquUig+OBgVjzIgxZjVQDqwVkdpxhyQB1cBrgL8GHhGRvAnO87AxZo0xZk1RUZGTTX6F1CQ3//q6Ws509vL7Q23886Yl/M/71jO/YObJ3rtWlfGRm6v5vxea+O6fJl+Mb7O1Z/LtER46sm2sLqRnYJh9Z7ui8vOVUpGVFIkfYozpEpFngE3AwTFPNQE7jDFDQKOIHMUXJHZHol2BuG5hId996zUsKMxgaUl4l6r+8E3VvNRxmS8/eYSqokxuXV5yxTGb61tZMz+fsrz0sP7sQF23qBCXwLPHOrh2gScqbVBKRY6Ts4+K7E/9IpIO3AwcGXfYr4EbrGMK8Q0nxdwa1ptqS8IeEMA3PPOVN65kZXkeH/n5visK
5k60X+JI2yVHNtMJVG56Mqsr8viTJpuVSghODh+VAltFpB7fJ/+njDGbReQhEbnbOuZJ4IKIHAK2Av9kjEmoEtq0ZDfff9s15GUkc99P99A+Zgrs4/tbESHg2U1O2VBdRH1TF1298bmftVIqcE7OPqo3xlxljFlpjKk1xjxkPX6/MeYx674xxvyDMabGGLPCGPNzp9oTy+bkpPHIvWvw9g1x30/30D80gjGG3x5oZe0CD8U5wc1wCreNi4swBraf0N6CUrOdVjTHiOVlufzbX62mvtnLP/7vfo60XeJEew93Rmito6msKs8lOy2JZ49pUFBqtotIolkF5pblJXxs01K+uOUIB5q8uARuq70y+RxpSW4X1y8qZNtxX72Gk3s5KKWiS3sKMeZ9G6t40zXlnOns5bqFhRRmpUa7SYAvr9Dq7eeljp7pD1ZKxS3tKcQYEeFzr19Bdloyd0Rx1tF4G6rt3djOs2hOdpRbo5RyivYUYlBKkov776rhmvn50W6KX4Ung6rCTF01ValZToOCCtiG6kJ2nLwQ0HpNSqn4pEFBBWzj4iL6h0bZc+pitJuilHKIBgUVsHVVBSS7RYeQlJrFNCiogGWmJnHN/Hy2ab2CUrOWBgUVlA3VRRxu7ab90pU70iml4p8GBRWUjdW+pcuf0yUvlJqVNCiooCwvy8GTmaJDSErNUhoUVFBcLuH6RYU8e/w8o6OT7xinlIpPGhRU0DYuLuJ8zwCH27qj3RSlVJhpUFBBs5e8eFY33lFq1tGgoIJWnJPG0pJsntV6BaVmHQ0KKiQbqgvZ3XiR3sHhaDdFKRVGGhRUSDZUFzE4MsrOxs5oN0UpFUYaFFRI1lZ6SE1yse2YDiEpNZtoUFAhSUt2s7bSo8lmpWYZDQoqZK9eXMSJ9h5auvqi3RSlVJhoUFAh22AteaGzkJSaPTQoqJAtLs6iOCeVbTqEpNSsoUFBhUxE2FBdxPbj5xnRJS+UmhUcCwoikiYiu0Rkv4g0iMiDExzzDhHpEJF91u09TrVHOWNDdSHeviEONHuj3RSlVBgkOXjuAeBGY0yPiCQD20VkizFmx7jjfmGM+YCD7VAO2lBdhAhsO9bB6oq8aDdHKTVDjvUUjE+P9WWyddMxhlnGk5lCbVmuJpsVALsaO7k8oFXu8czRnIKIuEVkH9AOPGWM2TnBYW8QkXoR+aWIVDjZHuWMDdWFvHCmix59M0hoXb2DvPnh5/nZztPRboqaAUeDgjFmxBizGigH1opI7bhDHgcWGGNWAn8AfjLReUTkvSKyR0T2dHToJ9JYs66qgJFRwwunL0a7KSqKmi72MWqg8fzlaDdFzUBEZh8ZY7qAZ4BN4x6/YIwZsL78PnDNJN//sDFmjTFmTVFRkaNtVcG7Zn4+bpewS9dBSmhtXt++3Wc7tZgxnjk5+6hIRPKs++nAzcCRcceUjvnybuCwU+1RzslMTaJ2bi47Gy9Euykqilq7fUHhTGdvlFuiZsLJnkIpsFVE6oHd+HIKm0XkIRG52zrmQ9Z01f3Ah4B3ONge5aC6Sg/7z3rpHxqJdlNUlLRay520dPVp3Uocc2xKqjGmHrhqgsfvH3P/E8AnnGqDipy6Sg8PbzvJvrNdrKsqiHZzVBTYw0fDo4ZWbx/l+RlRbpEKhVY0q7BYM9+DCJpXSGAt3j7cLgF0CCmeTdtTEJEDXFlf4AX2AP9qjNGBZEVuRjJLS3KsvEJ1tJujoqDN209tWQ77m7w0dfbBwmi3SIUikJ7CFuC3wFus2+PANqAN+LFjLVNxp67Sw97TFxkcHo12U1SEGWNo9fZz9fx8XAJnL2pPIV4FEhReZYz5hDHmgHX7FPAaY8yXgAXONk/Fk7pKD/1Doxxs0XWQEs3F3iEGhkepyM+gNDedszp8FLcCCQpZIlJnfyEia4Es60stYVV+11Z6AM0rJKJWr2/mUWluGhWedM0pxLFAgsJ7gEdEpFFETgGPAPeJSCbwBScbp+JLYVYqC4sy
2XlS00yJprXLN/OoNC+divwMzl7UArZ4NW2i2RizG1ghIrmAWNXJtv9xrGUqLq2tLGDz/hZGRo1/Joqa/ezCtdLcNOZ5Mui4NEDf4AjpKe4ot0wFK5DZR6nAG/DlD5JEfH/oxpiHHG2Zikvrqjz8964zHG7tpnZubrSboyKktauPJJdQmJVKhcdXn9B0sZfq4uwot0wFK5Dho98A9+DLH1wec1PqCtcu0LxCImrz9lOck4bbJVR40gGdgRSvAqloLjfGbJr+MKWgLC+dCk86Oxsv8K7rK6PdHBUhrd5+SnLTAPw9BV0YLz4F0lP4s4iscLwlatZYu6CAXY2dGKPr3ySKVm8fpVZQKMpKJS3ZpdNS41QgQeF6YK+IHLU2wzlgLXKn1ITqqjxc7B3iRHvP9AeruGcXrtlBQUQoz8/QaalxKpDho9scb4WaVeqseoUdjZ2aaEwAduFaaW66/7GK/HSdlhqnJu0piEiOdffSJDelJjTPk0FxTqommxPE2MI12zxPBk2dvTqEGIem6ik8CtwJ7MW3IN7YSecGqHKwXSqOiQh1lQXsbLyAMQZ7GrOancYWrtkqPBlcGhjG2zdEXkZKtJqmQjBpT8EYc6f1b6Uxpsr6175pQFBTWlvp4Vz3gI4rJ4CxhWs2ey8F/f+PP9MmmkXk6UAeU2osO6+w86QOIc12bd6XC9ds83RaatyaKqeQJiIeoFBE8kXEY90WAGWRaqCKT4vmZOHJTGGn5hVmvdaulwvXbFrAFr+myim8D/gIvgCwl5dzCt3Atx1ul4pzIsLaBR52ndLF8Wa7sYVrtuy0ZPIyknX4KA5NlVP4hjGmEvjouJzCKmPMtyLYRhWn1lZ6ONvZR0uXDiHMZmML18aqyM/QArZxfvRcI2/7wc5oN2NK0+YUjDH/LiK1IvKXIvJ2+xaJxqn4tlb3V5j1xheujTXPk0GT1iq8wpaDbTx7/DxdvYPRbsqkAkk0PwD8u3W7AfgycLfD7VKzwLLSHLLTkjSvMIt1WYVrJWMK12zlnnSaL/YxMqq1CgAjo4aGZt+uhIdau6PcmskFsszFG4GbgDZjzDuBVUDq1N+iFLhdwrULPOxq1LzCbNViFa6VTTJ8NDgyyjlrymqiazzfw+XBEQAOtcR3UOgzxowCw1aVcztauKYCtLbSw0sdlznfMxDtpigHtHl9b/jjE80wdlqq5hUA9p/19RKSXBL3PYU9IpIHfB/fLKQXgF3TfZM1pXWXiOwXkQYReXCKY98oIkZE1gTcchUXNK8wu7VYQaEs78rhI/8S2ppXAOBAs5eMFDfrFxbEd0/BGPO3xpguY8x3gdcC91rDSNMZAG40xqwCVgObRGTd+INEJBv4EBDbKXkVkhVzc0lPdmtQmKUmKlyzleWlIaJVzbb6pi5qy3JZMTeXE+09DAyPRLtJEwqkp+BnjDkFDIjI9wM41hhj7LWTk63bRBmnz+JLXuvA4yyU7HZxzfx8TTbPUhMVrtlSk9yU5KTRpEGB4ZFRGlq6WVGeS01ZDsOjhuPnYnNp+akqmleKyO9F5KCI/KuIFIvI/wFPA4cCObmIuEVkH748xFPGmJ3jnr8KqDDGbJ7Ba1Axbm2lhyNt3Xh7h6LdFBVmExWujVXhydCqZuB4ew8Dw6OsLM+lptS3AHWs5hWm6il8H99KqW8AOvDlEk4Ci4wxXw/k5MaYEWPMaqAcWCsitfZzIuICvg7843TnEZH3isgeEdnT0dERyI9WMWRtpQdjYPep+O4tbD3SzgcefYGhkdFoNyVmtHVPExTyM3T9I3xDRwAry/OYX5BJRoo7ZvMKUwWFVGPMj40xR40x3wBGgY8bY4Ie5jHGdAHPAGP3es4GaoFnROQUsA54bKJkszHmYWPMGmPMmqKiomB/vIqy1RV5pLhd7IrzoPCrF5vZXN/Kf+04He2mxARjDC1dfRNOR7VVeNJp6+6nf8i58fM2bz/d/bHdC61v8pKdlsR8TwZul7C0JDtmewpT
rX2UZg3v2IOFPcBKsRbHN8a8MNWJRaQIGDLGdIlIOnAz8CX7eWOMFygcc/wz+JbU2BPKC1GxKy3ZzeqKvLjPKzS0+KYUfv2pY9yzei6ezMTeJ2CqwjWbPS21uauPhUVZDrRhkI1f2crg8CiVhZnUzs1lxdwcVszNY/ncHHLSksP+M0NxoNnLirm5uKzcS01ZDr95sSUm9xuZKii0Al8b83XbmK8NcOM05y4FfiIibnw9kv8xxmwWkYeAPcaYx0Jss4pDays9fOdPL9EzMExWaiC7wMaW3sFhTp6/zO0rSniy4Rxff+oYn31d7fTfOItNVbhmqxhTq+BEUDjQ7GVweJS/XFOOt2+IF05f5PH9Lf7n7UCxcm4utXNzqZ2bQ3aEA8XA8AiHW7t51/WV/sdqSnP5rx1naLrY579GsWLSv05jzA0zObExph64aoLH75/k+NfM5Oep2La20sO3tp7ghdMX2bg4/oYAD7dewhh43eq5FGWl8p87TvOWdfNYWpIz/TfPUlMVrtkq8p2tVWiwxuU/cdsy8q2e24WeAQ40eznY7OVAs5e9pzqvCBQr5vqmhm5YXOj4/+Gxth6GRgyryvP8j9WU5fjbHzdBQalwumZ+Pm6XsKuxMy6DwiFr6Kh2bi5rKz38Zn8LDz52iEfvq4u57n+k2IVrpVMMH83JTiUlyeVYVXNDSzdz89L9AQGgICuV1yyZw2uWzPE/ZgeKA02+QLHnVCeP7W8h549JvPDp15LkDmp2flD2W0nmFXNz/Y8tKc7GJb4ZSJtqSxz72aHQoKAiIjM1idq5uXFbxNbQ0k1+RjKluWmICP/w2sXc/5sGnmw4F3N/1JHS5u3D7RKKsidfCs3lEsrz0x0MCl7/p+6pTBQofv1iMx/5xT4OtnSzuiJviu+emQNNXvIzkinPfzl4pqe4qSrKiskZSM6FR6XGqav0sO9sl6MzUZzS0NLN8rJcf6/gb9bOY3FxFp/73aG4fD3h0Ortpzg7dcLCtbEq8p2pVbg8MEzj+cvUluVOf/AErq/2zXN5/iVnF2ysb/ayojzvih5lTWkOh2NwBlIgS2dfPcFtoYhoL0MFpa7Sw+DIKPvOdkW7KUEZGhnlaNsllo/5RJrkdvHAXcs529nHD7Y3RrF10dPa1U/pBGsejVfhSefMhfAHhcOt3RjDK/5fglGYlcri4iyeP+lcUOgfGuHYuUusnHtl4Kopy6G5qy/m9lYIpKfwH8AO4GF8BW3PAz8HjonILQ62Tc0ya+Z7EIm/xfGOn+thcGT0imGKVy0q5JaaYr699URCLg89XeGabZ4ng+7+Ybx94a0lsJPMy+eGniheV1XAnlOdjhUkHmrtZmTUsLJ8gqAQo5XNgQSFU8BVVvHYNfhmFB3EV3fwZQfbpmaZ3IxklpbkxF1QsOsTlk8wTPGpO5YxPGL48hNHI92sqLIL10pzpg8K/hlIYc4rHGz24slMoSSANkxmfVUBvYMj1Dd5w9iyl9WffbmSeTz7Q0as5RUCCQpLjTEN9hfGmEP4gsRJ55qlZqu6Sg97T1+Mq6UiGlq6SU92U1mYecVz8wsyedf1lfzfC01xNyw2E3bhWmDDR76g0BTmvIIvz5Mzo9lfdVUFAOxwaAipvtlLUXYqxTlXJuMLs3yPx2NQOCoi3xGRV1u3/8A3dJQKxHZtuYo5dZUe+oZGONDszCczJxxq6WZZafakCdUP3LiIouxUPvNYA6MJsvVkq386auA9hXAuoT04PMrx9ksT9t6C4clMYWlJtmPJ5gNNXlbOzZ00cNWU5sTl8NE7gBPAR4C/x7co3jvwBYQZFbipxHNtnG26MzpqONTaPeWbT1ZqEv986xL2ne3iN/ubI9i66Gm1qpkDCQq5GcnkpCWFdWG8Y+cuMTRiQk4yj7WuqoA9pzsZHA5v7/XywDAnOnomHDqy1ZTlcKK9J6ZmsAUSFDYB3zLGvN4Y8zpjzFeNMb3GmNEx+yUoFZDCrFQWFmXGTVA409lL
z8DwtG8+b7i6nJXluXxxyxEuDwxHqHXR0xpA4dpY4V5Cu2FMMeFMrasqoH9o1F9kFi4NLb7ZURMlmW01pbkMjxpOtMfOW2kgQeFufMNF/ykid+hUVDVTaysL2N3YyUgcDLX4Z7hMM0zhcgkP3LWcc90DfOeZlyLRtKhqDaBwbayK/IywDh81tHSTlepbdXSm1lX5ZsWFewjJXi57qsAVi8nmQLbjfCewCPhf4G+Al0TkEacbpmavdVUeLg0Mx2ThzngNLV6SXMLikukXc7tmfj6vW13Gw8+enPWb1QdauGar8KTTdLEvbDmXBivP4wrw508lLyOFZSU5YU821zd5KctNmzJwzvdk+PZWiKG/hYAqmo0xQ8AWfPUJe4F7nGyUmt2uXRA/eYWGlm4WzckiNckd0PEfu20pbhG+sOWwwy2LrtauwGoUbPM8GQwOj9LRMzDjnz0yajjUMnWeJ1jrqgrYe/piWPdNPtDsZcUUQ0fg62EuK82Jr56CiGwSkR/jSza/EXgE37LYSoWkLC+dCk86OxudXV4gHBqCfPMpzU3nb1+zkN8daHN8+YRoausOrJrZVu4JX61C4/nL9A2NhCXJbFu/sICB4VFePBOevIK3b4jG85enTDLb7BlIsTJzLdDZR78GFhtj7jXG/M4YM/szacpRdZUF7GrsxJjY+EOYSHt3P+d7BoJ+87lvYxVz89J58PGGuMibBMsYQ6s3sMI1WzinpU5VTBiqtZW+vEK4hpAOWlOup0oy22rKcugZGKbJoeXFgxVITuHNxphfG2MGAETkVSLybeebpmaztZUeLvYOxdSsi/FeTjIHFxTSkt188vZlHGm7xM93n3GiaVHV1TtE/1BghWs2e4XQcExLPdTSTYrbRXVx+DbtyU1PZnlZTth6d3aF9IoAZke9vNxFbNTuBJRTEJHVIvJlay/lfwWOONoqNevVWfUKsbxFp/2JNJClmce7fUUJays9fPXJo3h7Z1eNZzCFa7a0ZDfFOalhmZZ6sMXLkpJsksO8B8L6qgJePBOeVXwPNHcxz5NBXsb0W7YuKbH2VoiRvMKkV1VEFovI/SJyGPgWcBYQY8wNxph/j1gL1aw0z5NBcU5qjAeFbuYXZIS0faOI8MBdNXT1DfFGZiBCAAAe+0lEQVSNp4870LrosQvXgkk0g7WE9gyHj4wx/uUtwm39wgIGR0Z54czFGZ+rvmn6JLMtLdnNwqKsmJmBNFWoPQLcBNxljLneCgSxU3an4pqIWHmFCzGbV5jpm8/yslzefO08fvr8qZgeJguW3VMoC7BwzVbhmXlQaPH209U75EhQWLPAg0tgxwyHkDovD9J0sY9VAQYF8PVGY76nALwBaAO2isj3ReQmIDH3HVSOWFvp4Vz3QFiLmsKlu3+IM529M05mfvSWxaSnuPnX3x4KU8uir83bH1Thmq3Ck0Frd/+MlpOwE7g1YUwy23LSklkxN3fG+yvU+7ffDHw3t5rSHFq8/Vy8HP29FSYNCsaYXxlj/gpYCjyDb92jYmtxPN1HQc2YnVd4eNtJ2i/F1n4E9qe2UPIJYxVkpfLhm6p55mgHW4+0h6NpUdfi7QuqcM1WkZ+OMdDSFXqyuaGlG5fAstLskM8xlXULC9h3tou+wdAHRQ402UtwBP67Y/+exUJBZyCzjy4bY35mjLkTKAf2AR93vGVq1ls0J4tbaor52c4zrP/CH3n3j3fzxMHWsC9MFopQZx5N5O3rF1BZmMkXtxyZFVNU27zBFa7Z7CW0Z9IzPNTipaooi4wUZ1bbWVdVwNCIYe/p0PMK9c1eqooyg8pFxdKGO0Gl740xncaY7xljbnSqQSpxiAgPv30NT//jq3nvxioOtnh5/3+9QN3n/8BnHmvwz/6JhoYW3zr4c7JD38DFlpLk4qO3LOHouUv86sX4X0W11Rtc4Zptnl3ANoMZSE4lmW3XLvDgdgnPnzwf8jns5bKDUZCVSklOWkzkFcI7p0upECwsyuJjm5by
3Mdu5EfvvJbrFhby6M4z3PHN7dz+jWf50XONdEZ4rPVQmN98bl9RwqryXL72+6MxtUxysEIpXLMV56SR7JaQaxUu9AzQ6u2n1oF8gi0rNYmV5bnsOBnarLj27n7auvsDqmQer6YsNvZWcCwoiEiaiOwSkf0i0iAiD05wzPtF5ICI7BOR7SJS41R7VOxLcru4Yckcvv2Wq9n5yZt48O7luF3Cg48fou7zf+D9/7mXpw+fY9jhXdv6h0Y43t4T1qAgInxs01JavP38147TYTtvpNmFa6EMH7ldwty89JB7CuEc0pvKuqoC9p/tCmkJdLtoLZBK5vFqSmNjbwUnewoDwI3GmFXAamCTiKwbd8yjxpgVxpjV+PZ7/pqD7VFxJD8zhXuvW8DjH7yeLR/ewNvXL2D3qU7e/ZM9rP/iH/nC7w5z6vxlR372sXOXGBk1YV1GAeC6RYVsXFzEt7aeoLs/Pgva/NNRQxg+gplNS20IU/J/OuurChgeNewJIa9Q3+zFJaG1saYsh+FRw/Fz0Z2+7FhQMD72q0u2bmbcMWP7Spnjn1cKYFlpDp++s4bnP3ET33vbNawqz+OR7Y3c9a3t9DiwoY2Tn0j/+dYldPUO8b0/xeeeC23doRWu2WYSFA62eCnPTw+oSngm1izIJ8klIS15caCpi+o52SElwmNluQtHcwoi4haRfUA78JQxZucEx/ydiLyEr6fwISfbo+JbSpKLW5eX8Mi9a/jpu9ZyqX+YZ46Gf5pnQ4uX7NQk/yJu4VQ7N5d7Vpfxg+2NnOuOrWm4gWjpCq1wzVaRn8HF3iEuhdBTCneeZzIZKUmsqsgLenE8Y0xAy2VPZp4ng8wUd9STzY4GBWPMiDU0VA6sFZHaCY75tjFmIfAx4F8mOo+IvFdE9ojIno6ODiebrOLEuqoCCjJT2HKwLeznPtjcTU1ZTlg2cJnIP752CSOjJi6Xvwi1cM1W4QltYbyegWEaz18O+5DeZNZXFXCg2RtUT7TV28/5nsGgKpnH8u+tEOVkc0RmHxljuvAVwG2a4rCfA6+b5PsfNsasMcasKSoqcqCFKt64XcIty4vZeqQ9rIm5kVHDkbbwbuAy3ryCDN5SN59f7D7LSx3xtfxFi7ePOSEUrtlCnZZqF3VFoqcAvnWQRkYNu4NYm8tfyRzCzCNbTVkOh1svRXVvBSdnHxWJSJ51Px24mXGrq4pI9Zgv7wDi76OTippNtaX0Do6w/Xjoc8rHO9nRQ//QqONvPh+4cRFpSS6++uRRR39OuLV5+4NaHXU8e0gu2LyCvbzFVPsdh9PV8/JJdktQQ0j1Tb6tW5eWhF5tXVPq21shHKvJhsrJnkIpvnWT6oHd+HIKm0XkIRG52zrmA9Z01X3APwD3OtgeNcusryogOy0prENI/iRzEEsUhKIwK5X7Nlax5WAbL4ZhVc5IafX2UxpiPgEgLyOZrNSkoDeUaWjppjArhTkhDlsFKz3FzVUV+UGtg3Sg2bekd1pyYFu3TsSetRTNvIKTs4/qjTFXGWNWGmNqjTEPWY/fb4x5zLr/YWPMcmPMamtJ7gan2qNmn5QkFzcvK+YPh88xFKbahYYWLylJLhYWhW8Dl8m8Z0MVhVkpfOmJI2FfKfbMhV7+4j+e49nj4cvB+QvXZtBTEBHK89ODXuqioaWbmrJcRCK3Jue6hQUcbPYGNH3YGEN9kzek+oSxFhdn43ZJVPMKWtGs4tqm2hK8fUPsDLECdbyGlm6WOrCBy0SyUpP44I3V7DjZyZ+Ohe/Nu9Xbx988soMXznTxv3uawnZeb1/ohWtjzQtyWurA8AjHz12iNkL5BNu6Kg+jhoDyCmc7+/D2DYVUyTyWb2+FzNnZU1AqEjZWF5Ge7GbLwdYZn8vJDVwm89dr5zHPk8GXnjgaluRix6UB3vL9nXh7h1hVnhvWfbDt6agzGT4CX61C08W+gNt1rK2HYQeKCadz9bx8UpJcAdUr7Pcvlz3zNtZEeQaSBgUV19JT
3NywtIgnG87NeAXS5i7fpz0n1uqfTEqSi3+8ZTGHW7t5bH/LjM518fIgb/vBTlq9/fzondfyxjUVtHX3h22/CrtwrTRvZj2Fivx0+oZGON8T2HpW9sKIkQzW4PvUfvW8vIDyCgeafcOOi4tnvqR3TVkOrd7+iK/3ZdOgoOLerctLON8zMONtFCO1ts54d60sY3lZDl/9/VEGhkObXtvdP8S9P9rFyfOXeeTeNaxZ4Hl5H+wwDa293FOY4fBRQXBLaB9s8ZKVmuSfzhpJ66oKONTaPe0+2/VNXSwrzSElaeZvqTWlvg8l0dpbQYOCins3Lp1DitvFEzOcheTfwKUkskHB5RI+fttSmi728bMdZ4L+/t7BYd79490caunmu2+9mlctKgSgek4WnswUdjTObCcxm124NtPlxO1pqU0BTrv0JZmdKyacyvqqAoyBnVNcw9FRw8Hm7qCXy55MtGcgaVBQcS87LZnrqwt54mDbjMbP7Q1c0lNCn1IYqg3VRbxqUQHf2noiqCUg+odGeO9P97L39EW+8earuHFpsf85EWHtAk/Yegqt3v4ZFa7ZyoOoVRgZNRxpvRTx3ptt9bw8UpNcUw4hnTx/mZ6B4RnPPLJ5MlMozU2LWl5Bg4KaFTYtL6G5q4+DzaH/IUU6yTzexzYtpfPyIN/fdjKg44dGRvnAoy+w/cR5vvLGVdyxsvSKY+qqPDR39QX8qXwqM52OaktPcVOYlRrQ8FHj+R76hkYinmS2pSa5uWZ+/pT7Kxxo9iWZZzrzaKya0hztKSg1EzfXFON2CU80hDYLqfPyIK3e/qgGhZXledyxspRHtjdOu2f1yKjhI7/Yxx8Ot/PZ19XyhmvKJzyurrIAgF1BLNcwmbYZFq6NNc+THtD6R3aQD2a/43BbX1XA4dZuLk6S+K1v8pJuTSUNl5qyHE50RGdvBQ0KalbwZKZQV+lhS4hDSC/PcInOJ1LbR29ZwuDwKP/+9IlJjxkdNfzzL+v5bX0rn7p9GW9bN3/SY5eUZJOTljTjISRjDC3evhnXKNgqPBkBLeUQyWLCyaxf6Ausk+UVDjR5WV6WQ1IYa1tqSnMYidLeChoU1KyxqbaEkx2XOdEe/B9StGYejVdZmMmb11bw37vOTLiJkDGGBx5r4P9eaOLvb17MfRurpjyf2yWsrfRMmSgNhF24Fo7hI/Alm1u9/dNWokeymHAyK8vzSE92TziENDwySkNLd8jLZU/Gn2yOwt4KGhTUrHHr8hKAkGYhNbR0MzfP+Q1cAvGhm6pJdrv46u9fuVieMYYvbjnCf+44zfteXcWHbloU0PnqKgs4daF3Rvs3hKtwzTbPk8HIqKG1a/I2GWM42OyNeqBOSXKxZkH+hEVsJzp8OY9VYcwngC9oZqUmRSWvoEFBzRrFOWlcPS8vpAXyGlq8jm/zGKg52Wnct6GSzfWtHGh6+ZPiN54+zve2neTt6+fz8U1LA14HqK7KqleYQV4hXIVrtnJ7X4UphpCaLvbR3T8c9SE98NUrHD13iQs9A6943N6TOdw9Bd/eCtlRmYGkQUHNKrfVlnKotZszFwKfbXPZv4FLbAQFgPs2VuHJ9C2WB/Dwtpf4tz8c543XlPOZu5YHtTBcTWkOWalJ7AxyJ7Gx7L2Zwzl8BFNPS42VIT3wBQW4MrAeaPIV1lUWhC/JbLNnIEV6bwUNCmpWsYeQnmwIvLdwpK0bY6KfZB4rOy2ZD9ywiO0nzvPR/93P5393hDtXlvKlN6wMuogrye3imvn5M+optHb14xIoygrP0tWluWm4XTJlT+FQixeXwNIIFxNOZGV5Lhkp7iuGkOqbvdTOdaawrqYsh8uDI2FbpiRQGhTUrDKvIIOa0pygFsiLpU+kY71l3TzK89P55d4mbl42h6//1eqQC8fqqjycaO/h/Ljhj0C1evspzkkL2wybJLeLuXnpnJliWurBlm4WzYlOMeF4yW4XaxZ4XlHENjg8yuHW7rDWJ4xlL3cR6SEk
DQpq1rmttoQXznQFnFhtaO4mPyM5bEMj4ZKa5OYrb1zFO65bwLf+5uoZzcCx6xWC2V5yrNYwTke1VXjSpxk+8sZU7219VQEn2nv8NSTHzl1icHg0bJXM41UXZ/n2VohwslmDgpp1NtUGN4TU0Op784nkBi6BWr+wgM/cvXxGu3mBb/gjPdkd8hBSm7efsjDNPLJV5GdMWmndcWmAc90DMdV789crWFNT7STzyrnO9BTSkt0sKsrSnoJSM1VdnM3CosyApqYOjYxyrK0npt58nJBs5RWC2XPYFu7CNVuFJ4PzPYNcHhi+4jm7mDBWZoQB1Jb5Evb2ENKB5i5y05Op8IQ3WI5VUxb55S40KKhZaVNtCTsbO6ddk/74uR4GR0Zj6s3HKWsrPRw9d4mu3uDW6Q934ZqtwmOvlnplXuHlPE/sDB8luV1cuyCfHVay2d5+08keZk1pDm3d/VdMhXWSBgU1K21aXsrIqOEPh85NeVysLG8RCXWVHoyB3aeC23fi5emo4R4+smoVJsgrHGrppsKTTm56clh/5kytX1jAyfOXOdvZy9G2S2HZaW0q9oeVw62XHP05Y2lQULNS7dwc5ual88Q0eYWGlm7Sk91UFoZ/nnmsWVWRR0qSK+h6hVav75O8E8NHMHEBW0OLl+WlsReo7XqFHz7XyPCocWzmka2mNPLLXWhQULOSiLCptoTtx89PuT9BQ4uXZaXZM94jIB6kJbu5qiIv6GSz3VMoC1M1s60gM4WMFPcV8/C7+4c4daE3qiujTmZ5WS7ZaUn8YvdZAMdmHtnyM1Moy02LaF5Bg4KatW6rLWFwZJQ/Hmmf8PnRUcOhlm5qHR4CiCV1lR4aWrx0B7GRT7gL12wiQkV+xhVLaB+OwXyCze0S6io99A6OUJiVEpFpzDVlORGdgaRBQc1aV8/Lpyg7ddKpqac7e7k8ODLrZx6NVVdVwKiBvUHkFcJduDZWhSf9immpsVpMaLOHkFbMjcw05prSHF7quByxvRUcCwoikiYiu0Rkv4g0iMiDExzzDyJySETqReRpEZl8YXilguRyCbfUFLP1SAd9g1f+QSVSktl29bx8kt0S1BBSW3f4p6PaKjwZnOnsfcUeGAdbvBRlpzInJ7aKCW3+oOBwPsFWU+bbW+HYucgkm53sKQwANxpjVgGrgU0ism7cMS8Ca4wxK4FfAl92sD0qAd1WW0rf0Ajbjndc8VxDSzdJLqG6OHobuERaeoqbleV5Qe2v0NrV79gwSUV+Br2DI6+YOnwoytuiTmd5WQ4P3FXDW+rmReTn+Ze7iFBewbGgYHzs3U6SrZsZd8xWY4zdd9wBTLynoFIhqqvykJuezJMTFLI1tHRTXZxNalL019aJpLpKDweavPQOXlk0Np4xhtYwbsM53sszkHx5hf6hEY63x3YxoYjwzldVUhyhnkx5fjrZqUkRyys4mlMQEbeI7APagaeMMTunOPzdwBYn26MST7LbxWtrinnq8DkGh1/e5csYw6GW6G/gEg1rKz0Mjxr2np4+r+DtG6JvaMS5noLnlbUKx85dYmTUJNSQ3nR8eytErrLZ0aBgjBkxxqzG1wNYKyK1Ex0nIm8F1gBfmeT594rIHhHZ09Fx5TCAUlPZtLyES/3Dr1jhsv3SAOd7BhMyKKxZ4MHtEnYFkFdwqnDNZu+rYE9LPdjse+Or1aDwCjVlORxujczeChGZfWSM6QKeATaNf05EbgY+BdxtjJmwltsY87AxZo0xZk1RUZGjbVWzz/XVhWSmuHlizHLaiZhktmWlJlFbluNf2G0qThWu2TJTkyjITPHPQGpo8ZKdluToekLxqKY0cnsrODn7qEhE8qz76cDNwJFxx1wFfA9fQJh4MrlSM5SW7OaGpXP4fcM5RqxPWg3WJ9JlpdnRbFrU1FUVsO9s17TTHMO949pEyj0v1yo0tHRTU5oTkyvWRpO93EUk8gpO9hRKga0iUg/sxpdT2CwiD4nI3dYxXwGygP8VkX0i
8piD7VEJbFNtCRcuD7LnlO/TcUNLNwsKMshOi621dSKlrtLD4MgoL57pmvK4Nq+vcG1OdngL18aa58ng7MVehkd8m9YkUjFhoKqLs6ir9JCa5PzgTpJTJzbG1ANXTfD4/WPu3+zUz1dqrBuWzCElycWWg23UVRXQ0Op1bB38eLBmgQcR2Nl4wb9PwERauvqZk+1M4ZqtIj+dLQdaOd7ew8DwaELmeaaTmuTmF+9bH5GfpRXNKiFkpiaxsbqIJxva8PYOcbazLyGWy55Mbnoyy0pypk02t3X3URrmNY/Gq/BkMDxq/MuRJGKeJ5ZoUFAJY1NtCa3efn6++wwQu8soREpdlYcXzlx8xVTd8ZwsXLPZM5C2HGwlNcnFwqLZv2JtLNOgoBLGzcvmkOQSvvunlwD9RFpXWUD/0Cj1TRPnFZwuXLPNswrYDjZ3s7Q0x9GhKjU9vfoqYeRlpLB+YQEXe4eYk51KkYPJ03iwttIDMOk6SN19w44WrtlK89KwVy5P9N5bLNCgoBLKrctLAH3zAfBkprC4OGvSfZtbHK5RsCW7Xf7eiP6/RJ8GBZVQblleTJJLWFWRuDOPxqqrLGDv6YsMj1yZV2hzuJp5LHsISSuZo0+Dgkooc7LTePyD13PfhqpoNyUm1FX5Now5OMG6OnZPIRIbyczzZOB2CUtKErOYMJY4VqegVKxaVqpDFDZ/XuHkBVaP6z1FonDN9u4Nlaxb6CEtObFWrI1F2lNQKoHNyU6jqjBzwmRzJArXbIuLs3n9VbpyfizQoKBUgqur8rD7VKd/XSibkzuuqdilQUGpBFdXWcCl/mEOj1tsrdXbT5nD1cwq9mhQUCrB1VVdWa9gjKG1q5+SHF3COtFoUFAqwZXmpjPPk8HOMfUKduGa9hQSjwYFpRRrKz3sOtXp39krUoVrKvZoUFBKUVfpoat3iOPtPUBkC9dUbNGgoJRiXZVvT4Wdjb4hpEgWrqnYokFBKUV5fjpluWn+fZsjWbimYosGBaUUIkJdVQE7Gy/4l8yOVOGaii36P66UAnzJ5vM9g5w8f5lWrxauJSoNCkopwJdsBth5slML1xKYBgWlFACVhZkUZaeys/GCFq4lMA0KSinAyitUeth6pD0iO66p2KRBQSnlV1fpobt/GPBtk6kSjwYFpZRfnVWvAFqjkKg0KCil/KrnZOHJTAG0mjlRORYURCRNRHaJyH4RaRCRByc4ZqOIvCAiwyLyRqfaopQKjIiwdoEHl0CRFq4lJCe34xwAbjTG9IhIMrBdRLYYY3aMOeYM8A7gow62QykVhPe/ZiFrFuSTrIVrCcmxoGCMMUCP9WWydTPjjjkFICKjTrVDKRWc1RV5V+zXrBKHox8FRMQtIvuAduApY8zOEM/zXhHZIyJ7Ojo6wttIpZRSfo4GBWPMiDFmNVAOrBWR2hDP87AxZo0xZk1RUVF4G6mUUsovIoOGxpgu4BlgUyR+nlJKqdA4OfuoSETyrPvpwM3AEad+nlJKqZlzsqdQCmwVkXpgN76cwmYReUhE7gYQkWtFpAl4E/A9EWlwsD1KKaWm4eTso3rgqgkev3/M/d348g1KKaVigE5EVkop5adBQSmllJ/4aszih4h0AKdD/PZC4HwYmzMb6TWaml6f6ek1mlq0rs98Y8y0c/rjLijMhIjsMcasiXY7Ypleo6np9ZmeXqOpxfr10eEjpZRSfhoUlFJK+SVaUHg42g2IA3qNpqbXZ3p6jaYW09cnoXIKSimlppZoPQWllFJTiPugICI/FJF2ETk45rFVIvK8iBwQkcdFJMd6PEVEfmQ9vl9EXjPme54RkaMiss+6zYnCywk7EakQka0ictjaAe/D1uMeEXlKRI5b/+Zbj4uIfFNETohIvYhcPeZc91rHHxeRe6P1msIpzNdnZMzvz2PRek3hFsI1Wmr9/Q2IyEfHnWuT9Xd2QkQ+Ho3XE25hvj6nrPenfSKyJxqvB2NMXN+AjcDVwMExj+0GXm3dfxfw
Wev+3wE/su7PAfYCLuvrZ4A10X49DlyfUuBq6342cAyoAb4MfNx6/OPAl6z7twNbAAHWATutxz3ASevffOt+frRfX6xcH+u5nmi/nhi5RnOAa4HPAR8dcx438BJQBaQA+4GaaL++WLk+1nOngMJovp647ykYY7YBneMeXgJss+4/BbzBul8DPG19XzvQBcTsfOFwMMa0GmNesO5fAg4Dc4F7gJ9Yh/0EeJ11/x7gp8ZnB5AnIqXArfgWNew0xlzEd13jfin0MF6fWSvYa2SMaTe+dc2Gxp1qLXDCGHPSGDMI/Nw6R1wL4/WJCXEfFCZxELjbuv8moMK6vx+4R0SSRKQSuGbMcwA/srptnxYRiVxzI0NEFuBbpHAnUGyMaQXfLzW+Ty/g+2U+O+bbmqzHJnt81pjh9QFIs3YI3CEir2MWCvAaTUZ/h6ZngN+LyF4Rea9T7ZyKY6ukRtm7gG+KyP3AY8Cg9fgPgWXAHnxLZfwZGLaee4sxpllEsoH/A94G/DSirXaQiGThe10fMcZ0TxHzJnrCTPH4rBCG6wMwzxjTIiJVwB9F5IAx5iUHmhsVQVyjSU8xwWOJ+Ds0lVdZv0NzgKdE5Ig1GhIxs7KnYIw5Yoy5xRhzDfDf+MYxMcYMG2P+3hiz2hhzD5AHHLeea7b+vQQ8iq+rOyuISDK+X9afGWP+n/XwOXvYw/q33Xq8iVf2nsqBlikej3thuj4YY+x/T+LLUV2xdHy8CvIaTUZ/h6Yx5neoHfgVUXgfmpVBwZ45JCIu4F+A71pfZ4hIpnX/tcCwMeaQNZxUaD2eDNyJbwgq7lnDYD8ADhtjvjbmqccAewbRvcBvxjz+dmuWzTrAa3V9nwRuEZF8axbFLdZjcS1c18e6LqnWOQuBVwGHIvIiHBbCNZrMbqBaRCpFJAV4s3WOuBau6yMimdZIBdb71C1E430omlnucNzw9QRa8SVtmoB3Ax/GNwPgGPBFXi7SWwAcxZcI+gO+VQMBMvHNRKoHGoBvAO5ov7YwXZ/r8XXR64F91u12oABf0v249a/HOl6Ab+PrXR1gzIwsfMNyJ6zbO6P92mLp+gDXWV/vt/59d7RfWxSvUYn1t9iNbzJHE5BjPXe79Xf5EvCpaL+2WLo++GZl7bduDdG6PlrRrJRSym9WDh8ppZQKjQYFpZRSfhoUlFJK+WlQUEop5adBQSmllJ8GBaWUUn4aFJSKAhFxR7sNSk1Eg4JS0xCRz9pr5Ftff05EPiQi/yQiu8W3r8KDY57/tbWgWcPYRc1EpEdEHhKRncD6CL8MpQKiQUGp6f0Aa7kCa+mUNwPngGp8a9OsBq4RkY3W8e8yvnW31gAfEpEC6/FMfPt+1BljtkfyBSgVqNm6SqpSYWOMOSUiF0TkKqAYeBHfJim3WPcBsvAFiW34AsHrrccrrMcvACP4Fk1TKmZpUFAqMI8A78C3bs0PgZuALxhjvjf2IPFt8XozsN4Y0ysizwBp1tP9xpiRSDVYqVDo8JFSgfkVvp3mrsW3OuyTwLusNfQRkbnW6ry5wEUrICzFt2WnUnFDewpKBcAYMygiW4Eu69P+70VkGfC8tZlKD/BW4Ang/SJSj29F3h3RarNSodBVUpUKgJVgfgF4kzHmeLTbo5RTdPhIqWmISA2+PSSe1oCgZjvtKSillPLTnoJSSik/DQpKKaX8NCgopZTy06CglFLKT4OCUkopPw0KSiml/P4/wv04nl3dTyUAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x61864bba8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "avg_rating_by_year = (ratings\n",
    " .withColumn(\"year\", F.expr(\"year(from_unixtime(timestamp))\"))\n",
    " .groupBy(\"year\")\n",
    " .agg(F.avg(\"rating\").alias(\"avg_rating\"))\n",
    ").toPandas()\n",
    "\n",
    "avg_rating_by_year = avg_rating_by_year.sort_values(\"year\")\n",
    "avg_rating_by_year.index = avg_rating_by_year.year\n",
    "\n",
    "avg_rating_by_year.avg_rating.plot()\n",
    "\n",
    "plt.ylabel(\"Avg Rating\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find number of unique users."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(count(DISTINCT userId)=671)"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings.selectExpr(\"count(distinct userId)\").first()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find number of unique movieIds."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(count(DISTINCT movieId)=9066)"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings.selectExpr(\"count(distinct movieId)\").first()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Is every movie id mentioned in the ratings table also present in the movies table? The anti-join below returns the ratings whose movieId has no match in movies, so the result should be empty (0 rows) if all values are present."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+------+---------+\n",
      "|movieId|userId|rating|timestamp|\n",
      "+-------+------+------+---------+\n",
      "+-------+------+------+---------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "ratings.join(movies, on = [\"movieId\"], how = \"leftanti\").show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "What is the distribution of the number of ratings per movie?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------------------+\n",
      "|summary|             count|\n",
      "+-------+------------------+\n",
      "|  count|              9066|\n",
      "|   mean|11.030664019413193|\n",
      "| stddev|24.050799967665892|\n",
      "|    min|                 1|\n",
      "|    max|               341|\n",
      "+-------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "ratings.groupBy(\"movieId\").count().select(\"count\").describe().show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find the rating count and the average rating for each user."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-----+------------------+\n",
      "|userId|count|        avg_rating|\n",
      "+------+-----+------------------+\n",
      "|    12|   61|2.7540983606557377|\n",
      "|    18|   51| 3.235294117647059|\n",
      "|    38|  111| 4.045045045045045|\n",
      "|    67|  103|3.7572815533980584|\n",
      "|    70|   83| 4.204819277108434|\n",
      "|    93|  159|3.5723270440251573|\n",
      "|   161|   90| 3.588888888888889|\n",
      "|   186|   42|3.5952380952380953|\n",
      "|   190|   60|               4.0|\n",
      "|   218|   42|3.6904761904761907|\n",
      "|   225|   28| 4.107142857142857|\n",
      "|   257|  103| 3.378640776699029|\n",
      "|   261|   50|              3.44|\n",
      "|   263|  114| 3.043859649122807|\n",
      "|   273|   92| 4.119565217391305|\n",
      "|   275|  202| 4.262376237623762|\n",
      "|   280|   22| 4.318181818181818|\n",
      "|   295|  236|  3.73728813559322|\n",
      "|   300|   44|3.8181818181818183|\n",
      "|   317|   22|               3.5|\n",
      "+------+-----+------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "avg_rating = ratings.groupBy(\"userId\").agg(F.count(\"*\").alias(\"count\"), F.avg(\"rating\").alias(\"avg_rating\"))\n",
    "avg_rating.show()"
   ]
  },
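  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Which movies are rated most consistently by users? We can take the standard deviation of the ratings as a measure of consistency. The result below is sorted by descending standard deviation, so the least consistently rated movies appear first."
   ]
  },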
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-----------+------------------+-----+\n",
      "|movieId|avg(rating)|               std|count|\n",
      "+-------+-----------+------------------+-----+\n",
      "|   2483|        2.5|3.5355339059327378|    2|\n",
      "|  27178|        2.5|3.5355339059327378|    2|\n",
      "|   6219|        2.5|3.5355339059327378|    2|\n",
      "|  75341|        2.0|2.8284271247461903|    2|\n",
      "|   7577|        2.0|2.8284271247461903|    2|\n",
      "|  98122|        2.0|2.8284271247461903|    2|\n",
      "|   4562|        3.0|2.8284271247461903|    2|\n",
      "|   4404|        3.0|2.8284271247461903|    2|\n",
      "|  31364|        2.0|2.8284271247461903|    2|\n",
      "|   3415|        3.0|2.8284271247461903|    2|\n",
      "|   2824|        3.0|2.8284271247461903|    2|\n",
      "|  66246|        2.0|2.8284271247461903|    2|\n",
      "|   6127|        2.0|2.8284271247461903|    2|\n",
      "|   3490|        3.0|2.8284271247461903|    2|\n",
      "|   3892|        3.0|2.8284271247461903|    2|\n",
      "| 135436|        3.0|2.8284271247461903|    2|\n",
      "|   4630|        3.0|2.8284271247461903|    2|\n",
      "|   5179|        3.0|2.8284271247461903|    2|\n",
      "|  95199|        2.0|2.8284271247461903|    2|\n",
      "|   8574|        2.0|2.8284271247461903|    2|\n",
      "+-------+-----------+------------------+-----+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "(ratings\n",
    " .groupBy(\"movieId\")\n",
    " .agg(F.avg(\"rating\")\n",
    "       , F.stddev(\"rating\").alias(\"std\")\n",
    "       , F.count(\"*\").alias(\"count\"))\n",
    " .filter(\"not isnan(std)\")\n",
    " .orderBy(F.desc(\"std\"))\n",
    ").show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Divide the dataset into training and test sets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random split of the ratings with weights 0.7 (train) / 0.3 (test), seeded with 1.\n",
    "df_train, df_test = ratings.randomSplit(weights=[0.7, 0.3], seed=1)\n",
    "\n",
    "# Register the training set as the temp view `df_train` and mark it for caching.\n",
    "cache_df(df_train, \"df_train\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Apply matrix factorization using ALS (alternating least squares) from Spark MLlib."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.recommendation import ALS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configure ALS (rank 10, 5 iterations, seed 0) on the userId/movieId/rating\n",
    "# columns, then fit it on the training split.\n",
    "als = ALS(\n",
    "    userCol=\"userId\",\n",
    "    itemCol=\"movieId\",\n",
    "    ratingCol=\"rating\",\n",
    "    rank=10,\n",
    "    maxIter=5,\n",
    "    seed=0,\n",
    ")\n",
    "als_model = als.fit(df_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "View the item factors. These factors are learned by the ALS model and represent latent properties of each movie. The rank (dimension) is 10, as specified in the ALS model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>features</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10</td>\n",
       "      <td>[-0.6119903326034546, 0.2717360258102417, 0.08...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20</td>\n",
       "      <td>[-0.5250846743583679, 0.8127381205558777, 0.32...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>30</td>\n",
       "      <td>[-1.2488008737564087, 0.31565776467323303, -0....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>40</td>\n",
       "      <td>[-0.4015123248100281, -0.24699324369430542, -0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>50</td>\n",
       "      <td>[-0.9165343046188354, -0.3675995469093323, -0....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>60</td>\n",
       "      <td>[-0.85568767786026, 0.21060891449451447, 0.085...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>70</td>\n",
       "      <td>[-0.7626186013221741, -0.6614616513252258, -0....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>80</td>\n",
       "      <td>[-1.188989520072937, -0.6905847787857056, -1.0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>100</td>\n",
       "      <td>[-1.0212198495864868, 0.11593789607286453, 0.4...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>110</td>\n",
       "      <td>[-0.6032668948173523, -0.10099033266305923, -0...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    id                                           features\n",
       "0   10  [-0.6119903326034546, 0.2717360258102417, 0.08...\n",
       "1   20  [-0.5250846743583679, 0.8127381205558777, 0.32...\n",
       "2   30  [-1.2488008737564087, 0.31565776467323303, -0....\n",
       "3   40  [-0.4015123248100281, -0.24699324369430542, -0...\n",
       "4   50  [-0.9165343046188354, -0.3675995469093323, -0....\n",
       "5   60  [-0.85568767786026, 0.21060891449451447, 0.085...\n",
       "6   70  [-0.7626186013221741, -0.6614616513252258, -0....\n",
       "7   80  [-1.188989520072937, -0.6905847787857056, -1.0...\n",
       "8  100  [-1.0212198495864868, 0.11593789607286453, 0.4...\n",
       "9  110  [-0.6032668948173523, -0.10099033266305923, -0..."
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "als_model.itemFactors.limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find all the movies rated by userId = 100. This is just an example user."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+------+---------+--------------------+--------------------+\n",
      "|movieId|userId|rating|timestamp|               title|              genres|\n",
      "+-------+------+------+---------+--------------------+--------------------+\n",
      "|     32|   100|     5|854193977|Twelve Monkeys (a...|Mystery|Sci-Fi|Th...|\n",
      "|   1073|   100|     5|854194056|Willy Wonka & the...|Children|Comedy|F...|\n",
      "|      3|   100|     4|854194024|Grumpier Old Men ...|      Comedy|Romance|\n",
      "|    608|   100|     4|854194024|        Fargo (1996)|Comedy|Crime|Dram...|\n",
      "|    745|   100|     4|854194208|Wallace & Gromit:...|Animation|Childre...|\n",
      "|    802|   100|     4|854194111|   Phenomenon (1996)|       Drama|Romance|\n",
      "|   1356|   100|     4|854194086|Star Trek: First ...|Action|Adventure|...|\n",
      "|     25|   100|     4|854193977|Leaving Las Vegas...|       Drama|Romance|\n",
      "|      1|   100|     4|854193977|    Toy Story (1995)|Adventure|Animati...|\n",
      "|      6|   100|     3|854194023|         Heat (1995)|Action|Crime|Thri...|\n",
      "|      7|   100|     3|854194024|      Sabrina (1995)|      Comedy|Romance|\n",
      "|     52|   100|     3|854194056|Mighty Aphrodite ...|Comedy|Drama|Romance|\n",
      "|    135|   100|     3|854194086|Down Periscope (1...|              Comedy|\n",
      "|    708|   100|     3|854194056|Truth About Cats ...|      Comedy|Romance|\n",
      "|     62|   100|     3|854193977|Mr. Holland's Opu...|               Drama|\n",
      "|     86|   100|     3|854194208| White Squall (1996)|Action|Adventure|...|\n",
      "|    141|   100|     3|854193977|Birdcage, The (1996)|              Comedy|\n",
      "|    648|   100|     3|854193977|Mission: Impossib...|Action|Adventure|...|\n",
      "|    733|   100|     3|854194024|    Rock, The (1996)|Action|Adventure|...|\n",
      "|     95|   100|     3|854193977| Broken Arrow (1996)|Action|Adventure|...|\n",
      "+-------+------+------+---------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "ratings.filter(\"userId = 100\").join(movies, \n",
    "            on = \"movieId\").orderBy(F.desc(\"rating\")).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Item to item similarity"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We see that movieId = 32 is one of the top-rated movies for userId = 100. Let's find movies similar to movieId = 32. Similarity is measured with the cosine distance between the item factor vectors, so a smaller value means a more similar movie."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-------------------------------------------------------------------------+--------------------+\n",
      "|movieId|title                                                                    |similarity          |\n",
      "+-------+-------------------------------------------------------------------------+--------------------+\n",
      "|32     |Twelve Monkeys (a.k.a. 12 Monkeys) (1995)                                |0.0                 |\n",
      "|46322  |Jet Li's Fearless (Huo Yuan Jia) (2006)                                  |0.008989705001690163|\n",
      "|2275   |Six-String Samurai (1998)                                                |0.010203252621868497|\n",
      "|2502   |Office Space (1999)                                                      |0.010878541752999515|\n",
      "|49957  |History Boys, The (2006)                                                 |0.014252301792101796|\n",
      "|62344  |Rachel Getting Married (2008)                                            |0.01599857601792687 |\n",
      "|76077  |Hot Tub Time Machine (2010)                                              |0.017122048712169047|\n",
      "|55814  |Diving Bell and the Butterfly, The (Scaphandre et le papillon, Le) (2007)|0.017737225957067615|\n",
      "|7061   |Dark Victory (1939)                                                      |0.018288454968269807|\n",
      "|7437   |Connie and Carla (2004)                                                  |0.01838704386978518 |\n",
      "+-------+-------------------------------------------------------------------------+--------------------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.sql.types import DoubleType\n",
    "import numpy as np\n",
    "import scipy\n",
    "import scipy.spatial\n",
    "\n",
    "def distance(v1, v2):\n",
    "    v1 = np.array(v1)\n",
    "    v2 = np.array(v2)\n",
    "    return float(scipy.spatial.distance.cosine(v1, v2))\n",
    "\n",
    "spark.udf.register(\"distance\", distance, DoubleType())\n",
    "\n",
    "def recommendation_by_i2i(movie_id):\n",
    "    \"\"\"Rank all movies by how close their ALS item factors are to those of `movie_id`.\n",
    "\n",
    "    Returns a DataFrame with the columns movieId, title and similarity, sorted\n",
    "    by ascending similarity. The similarity column holds the value of the\n",
    "    registered `distance` SQL function (cosine distance), so a smaller value\n",
    "    means a more similar movie. The query movie itself is part of the result.\n",
    "    \"\"\"\n",
    "    item_factors = als_model.itemFactors\n",
    "    query_item = item_factors.filter(F.col(\"id\") == movie_id).alias(\"t1\")\n",
    "    all_pairs = query_item.crossJoin(item_factors.alias(\"t2\"))\n",
    "    scored = all_pairs.withColumn(\n",
    "        \"similarity\", F.expr(\"distance(t1.features, t2.features)\"))\n",
    "    with_titles = scored.join(movies, F.col(\"t2.id\") == F.col(\"movieId\"))\n",
    "    return (with_titles\n",
    "            .orderBy(F.asc(\"similarity\"))\n",
    "            .select(\"movieId\", \"title\", \"similarity\"))\n",
    "\n",
    "recommendation_by_i2i(32).show(10, False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+---------------------------------------------+--------------------+\n",
      "|movieId|title                                        |similarity          |\n",
      "+-------+---------------------------------------------+--------------------+\n",
      "|10     |GoldenEye (1995)                             |0.0                 |\n",
      "|3249   |Hand That Rocks the Cradle, The (1992)       |0.019739230609736924|\n",
      "|2406   |Romancing the Stone (1984)                   |0.02105859648238928 |\n",
      "|454    |Firm, The (1993)                             |0.021639110689007923|\n",
      "|1393   |Jerry Maguire (1996)                         |0.023246487402172167|\n",
      "|733    |Rock, The (1996)                             |0.023711253436034863|\n",
      "|2      |Jumanji (1995)                               |0.025654501255907514|\n",
      "|867    |Carpool (1996)                               |0.02693095927125455 |\n",
      "|747    |Stupids, The (1996)                          |0.027019454687792543|\n",
      "|103235 |Best Offer, The (Migliore offerta, La) (2013)|0.02740922637162979 |\n",
      "+-------+---------------------------------------------+--------------------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "recommendation_by_i2i(10).show(10, False) # movieId for GoldenEye (1995) is 10 "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Like the item factors, we can inspect the user factors, which represent the latent properties of each user."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>features</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10</td>\n",
       "      <td>[-0.27867308259010315, 0.5459980964660645, -0....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20</td>\n",
       "      <td>[-0.9683108925819397, 0.06215322017669678, 0.1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>30</td>\n",
       "      <td>[-0.4050779938697815, 0.39974474906921387, -0....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>40</td>\n",
       "      <td>[-0.3554266691207886, -0.3823551535606384, -0....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>50</td>\n",
       "      <td>[-0.5981726050376892, 0.3503716289997101, 0.10...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>60</td>\n",
       "      <td>[-0.266533762216568, -0.4195394814014435, -0.5...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>70</td>\n",
       "      <td>[-0.8268013000488281, 0.2105957567691803, 0.26...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>80</td>\n",
       "      <td>[-0.7081996202468872, -0.46596139669418335, -0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>90</td>\n",
       "      <td>[-0.5121882557868958, -0.030749192461371422, 0...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>100</td>\n",
       "      <td>[-0.65791916847229, 0.05630388855934143, -0.17...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    id                                           features\n",
       "0   10  [-0.27867308259010315, 0.5459980964660645, -0....\n",
       "1   20  [-0.9683108925819397, 0.06215322017669678, 0.1...\n",
       "2   30  [-0.4050779938697815, 0.39974474906921387, -0....\n",
       "3   40  [-0.3554266691207886, -0.3823551535606384, -0....\n",
       "4   50  [-0.5981726050376892, 0.3503716289997101, 0.10...\n",
       "5   60  [-0.266533762216568, -0.4195394814014435, -0.5...\n",
       "6   70  [-0.8268013000488281, 0.2105957567691803, 0.26...\n",
       "7   80  [-0.7081996202468872, -0.46596139669418335, -0...\n",
       "8   90  [-0.5121882557868958, -0.030749192461371422, 0...\n",
       "9  100  [-0.65791916847229, 0.05630388855934143, -0.17..."
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "als_model.userFactors.limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### User to User similarity"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find the top movies for each user; we assume that the top movies are those the user has rated 4 or 5."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+--------------------+\n",
      "|userId|          top_movies|\n",
      "+------+--------------------+\n",
      "|    12|[1220, 1235, 3798...|\n",
      "|    18|[785, 17, 32, 786...|\n",
      "|    38|[110, 356, 8197, ...|\n",
      "|    67|[610, 356, 589, 1...|\n",
      "|    70|[81, 52, 785, 25,...|\n",
      "|    93|[44195, 1097, 499...|\n",
      "|   161|[161, 356, 589, 5...|\n",
      "|   186|[4306, 4226, 4873...|\n",
      "|   190|[2716, 2390, 2585...|\n",
      "|   218|[2186, 4226, 1200...|\n",
      "|   225|[356, 589, 153, 5...|\n",
      "|   257|[3000, 1222, 3080...|\n",
      "|   261|[356, 589, 590, 7...|\n",
      "|   263|[356, 2803, 590, ...|\n",
      "|   273|[31696, 8874, 441...|\n",
      "|   275|[356, 3450, 71579...|\n",
      "|   280|[1917, 17, 2006, ...|\n",
      "|   295|[466, 356, 364, 4...|\n",
      "|   300|[110, 356, 589, 1...|\n",
      "|   317|[356, 150, 165, 3...|\n",
      "+------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# One row per user with the set of movieIds that the user rated 4 or 5.\n",
    "top_rated_movies_by_user = (\n",
    "    ratings\n",
    "    .where(F.col(\"rating\").isin(4, 5))\n",
    "    .groupBy(\"userId\")\n",
    "    .agg(F.collect_set(\"movieId\").alias(\"top_movies\"))\n",
    ")\n",
    "top_rated_movies_by_user.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+--------------------------------------+\n",
      "|movieId|title                                 |\n",
      "+-------+--------------------------------------+\n",
      "|589    |Terminator 2: Judgment Day (1991)     |\n",
      "|4995   |Beautiful Mind, A (2001)              |\n",
      "|70286  |District 9 (2009)                     |\n",
      "|6440   |Barton Fink (1991)                    |\n",
      "|4822   |Max Keeble's Big Move (2001)          |\n",
      "|1222   |Full Metal Jacket (1987)              |\n",
      "|3421   |Animal House (1978)                   |\n",
      "|1193   |One Flew Over the Cuckoo's Nest (1975)|\n",
      "|48780  |Prestige, The (2006)                  |\n",
      "|608    |Fargo (1996)                          |\n",
      "+-------+--------------------------------------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "def recommendation_by_u2u(user_id, max_distance=0.03):\n",
    "    \"\"\"Recommend the top movies of users whose ALS factors are close to `user_id`.\n",
    "\n",
    "    The user factors of `user_id` are compared with the user factors of every\n",
    "    other user through the registered `distance` SQL function (cosine\n",
    "    distance). Users closer than `max_distance` count as neighbours, and the\n",
    "    `top_movies` listed for them in `top_rated_movies_by_user` are returned.\n",
    "\n",
    "    `max_distance` is a hyper-parameter (default 0.03) that can be tuned.\n",
    "    Returns a DataFrame with the distinct columns movieId and title.\n",
    "    \"\"\"\n",
    "    return (als_model\n",
    "     .userFactors\n",
    "     .filter(F.col(\"id\") == user_id)\n",
    "     .alias(\"t1\")\n",
    "     # Compare against the other users' factors (not the item factors), so that\n",
    "     # t2.id is a user id and can be matched with top_rated_movies_by_user.userId.\n",
    "     .crossJoin(als_model.userFactors.alias(\"t2\"))\n",
    "     .filter(F.col(\"t2.id\") != user_id)  # a user is not their own neighbour\n",
    "     .withColumn(\"similarity\", F.expr(\"distance(t1.features, t2.features)\"))\n",
    "     .filter(F.col(\"similarity\") < max_distance)\n",
    "     .join(top_rated_movies_by_user.alias(\"t3\"), F.col(\"t2.id\") == F.col(\"t3.userId\"))\n",
    "     .select(\"t1.id\", F.explode(\"top_movies\").alias(\"movieId\"))\n",
    "     .join(movies, on = \"movieId\")\n",
    "     .select(\"movieId\", \"title\")\n",
    "     .distinct()  # several neighbours may have the same movie in their top list\n",
    "    )\n",
    "\n",
    "recommendation_by_u2u(100).show(10, False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+------+---------+--------------------+--------------------+\n",
      "|movieId|userId|rating|timestamp|               title|              genres|\n",
      "+-------+------+------+---------+--------------------+--------------------+\n",
      "|     32|   100|     5|854193977|Twelve Monkeys (a...|Mystery|Sci-Fi|Th...|\n",
      "|   1073|   100|     5|854194056|Willy Wonka & the...|Children|Comedy|F...|\n",
      "|      3|   100|     4|854194024|Grumpier Old Men ...|      Comedy|Romance|\n",
      "|    608|   100|     4|854194024|        Fargo (1996)|Comedy|Crime|Dram...|\n",
      "|    745|   100|     4|854194208|Wallace & Gromit:...|Animation|Childre...|\n",
      "|    802|   100|     4|854194111|   Phenomenon (1996)|       Drama|Romance|\n",
      "|   1356|   100|     4|854194086|Star Trek: First ...|Action|Adventure|...|\n",
      "|     25|   100|     4|854193977|Leaving Las Vegas...|       Drama|Romance|\n",
      "|      1|   100|     4|854193977|    Toy Story (1995)|Adventure|Animati...|\n",
      "|      6|   100|     3|854194023|         Heat (1995)|Action|Crime|Thri...|\n",
      "|      7|   100|     3|854194024|      Sabrina (1995)|      Comedy|Romance|\n",
      "|     52|   100|     3|854194056|Mighty Aphrodite ...|Comedy|Drama|Romance|\n",
      "|    135|   100|     3|854194086|Down Periscope (1...|              Comedy|\n",
      "|    708|   100|     3|854194056|Truth About Cats ...|      Comedy|Romance|\n",
      "|     62|   100|     3|854193977|Mr. Holland's Opu...|               Drama|\n",
      "|     86|   100|     3|854194208| White Squall (1996)|Action|Adventure|...|\n",
      "|    141|   100|     3|854193977|Birdcage, The (1996)|              Comedy|\n",
      "|    648|   100|     3|854193977|Mission: Impossib...|Action|Adventure|...|\n",
      "|    733|   100|     3|854194024|    Rock, The (1996)|Action|Adventure|...|\n",
      "|     95|   100|     3|854193977| Broken Arrow (1996)|Action|Adventure|...|\n",
      "+-------+------+------+---------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "(ratings\n",
    " .join(movies, on = \"movieId\")\n",
    " .filter(\"userId = 100\")\n",
    " .orderBy(F.desc(\"rating\"))\n",
    ").show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's see what results we get by matching the user factors against the movie factors by similarity."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+----------------------------------------------+\n",
      "|similarity          |title                                         |\n",
      "+--------------------+----------------------------------------------+\n",
      "|0.02291440172371373 |Mary Poppins (1964)                           |\n",
      "|0.033117143234550506|Bus 174 (Ônibus 174) (2002)                   |\n",
      "|0.03588223110136224 |Fantastic Mr. Fox (2009)                      |\n",
      "|0.04020801883324454 |Christmas Story, A (1983)                     |\n",
      "|0.04325626207399513 |Drop Dead Gorgeous (1999)                     |\n",
      "|0.044425947834241364|Once (2006)                                   |\n",
      "|0.0445282917289207  |Manon of the Spring (Manon des sources) (1986)|\n",
      "|0.04523102711801197 |Elizabeth (1998)                              |\n",
      "|0.04547643759010589 |Tom Jones (1963)                              |\n",
      "|0.04608449690531313 |Others, The (2001)                            |\n",
      "|0.047883872328780996|Lord of the Rings, The (1978)                 |\n",
      "|0.048786415902191016|Tom & Viv (1994)                              |\n",
      "|0.049018831374267435|Sense and Sensibility (1995)                  |\n",
      "|0.04912316446316434 |Jungle Book, The (1967)                       |\n",
      "|0.0503961480536359  |Every Little Step (2008)                      |\n",
      "|0.05106430868804357 |Grand Budapest Hotel, The (2014)              |\n",
      "|0.05118087639425928 |Roman Holiday (1953)                          |\n",
      "|0.05150845853014585 |My Dog Skip (1999)                            |\n",
      "|0.05153310806439826 |Jean de Florette (1986)                       |\n",
      "|0.055603525479829274|Ben-Hur (1959)                                |\n",
      "+--------------------+----------------------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "def recommendation_by_u2i(userId, top_n=20):\n",
    "    \"\"\"Return the `top_n` movies whose item factors are closest to a user's factors.\n",
    "\n",
    "    The ALS user factors of `userId` are compared with every row of\n",
    "    als_model.itemFactors through the registered `distance` SQL function\n",
    "    (cosine distance, so a smaller value means closer). The result has the\n",
    "    columns similarity and title, sorted by ascending similarity.\n",
    "    \"\"\"\n",
    "    return (als_model\n",
    "     .userFactors\n",
    "     .filter(F.col(\"id\") == userId)\n",
    "     .alias(\"t1\")\n",
    "     .crossJoin(als_model.itemFactors.alias(\"t2\"))\n",
    "     .withColumn(\"similarity\", F.expr(\"distance(t1.features, t2.features)\"))\n",
    "     .join(movies, F.col(\"t2.id\") == F.col(\"movieId\"))\n",
    "     .select(\"similarity\", \"title\")\n",
    "     # Sort after the join and right before the limit: a join does not guarantee\n",
    "     # that an earlier ordering survives, so sorting first could return an\n",
    "     # arbitrary set of rows instead of the closest ones.\n",
    "     .orderBy(F.asc(\"similarity\"))\n",
    "     .limit(top_n)\n",
    "    )\n",
    "recommendation_by_u2i(100).show(20, False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Predict the rating for each movie and user combination in the df_test dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+------+----------+----------+\n",
      "|userId|movieId|rating| timestamp|prediction|\n",
      "+------+-------+------+----------+----------+\n",
      "|   157|     12|     2|1291598164| 2.2820864|\n",
      "|   564|     12|     1| 974709821| 3.0669582|\n",
      "|    91|     12|     3|1448813887| 3.8011193|\n",
      "|   624|     12|     2|1019563753| 2.0999196|\n",
      "|   636|     18|     3| 855227364| 3.5984566|\n",
      "|   616|     18|     4| 860573132| 3.3177824|\n",
      "|   571|     18|     5|1334342436| 2.6853292|\n",
      "|   135|     18|     4| 844996129| 3.0119503|\n",
      "|   255|     18|     2|1236980522|   2.22563|\n",
      "|   461|     18|     1|1091959887| 2.5870395|\n",
      "|   408|     18|     5| 933116210| 2.7201877|\n",
      "|   507|     18|     4| 862091839| 3.9740381|\n",
      "|   177|     18|     4| 907380994| 3.8975425|\n",
      "|    30|     18|     2| 945277971|   3.56654|\n",
      "|   655|     18|     4|1470073389| 3.5001454|\n",
      "|   165|     70|     5|1111480089| 2.4006498|\n",
      "|   358|     70|     1| 957534713| 2.4248333|\n",
      "|    34|     70|     4| 973746231| 3.1537461|\n",
      "|   580|     70|     2|1165292373| 2.4286094|\n",
      "|   647|     70|     3| 947292818| 2.8044093|\n",
      "+------+-------+------+----------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_prediction = als_model.transform(df_test)\n",
    "df_prediction.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find the predicted ratings for user 575 and compare them against the actual ratings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+------+----------+----------+\n",
      "|userId|movieId|rating| timestamp|prediction|\n",
      "+------+-------+------+----------+----------+\n",
      "|   575|    613|     4|1012594056| 3.0942912|\n",
      "|   575|    912|     5|1012593307| 3.9532857|\n",
      "|   575|   1256|     5|1012595762| 3.5967388|\n",
      "|   575|   1479|     2|1012594286| 2.6165662|\n",
      "|   575|   1943|     3|1012606205|  2.818478|\n",
      "|   575|   2716|     5|1012593003| 3.3207188|\n",
      "|   575|   2728|     3|1012604429| 3.7774029|\n",
      "|   575|   2968|     5|1012597696|   3.29526|\n",
      "|   575|   3196|     4|1012604409|  3.940699|\n",
      "|   575|   3516|     3|1018056630| 1.8980706|\n",
      "|   575|   5232|     4|1018058258|       NaN|\n",
      "|   575|    539|     3|1012593690| 3.1185648|\n",
      "|   575|    913|     5|1012593062| 3.9654267|\n",
      "|   575|   1022|     4|1012594848| 3.2438915|\n",
      "|   575|   1597|     4|1012594028| 2.3618226|\n",
      "|   575|   2099|     3|1012594848| 4.1307707|\n",
      "|   575|   3244|     4|1012596068| 3.0916982|\n",
      "|   575|   3421|     3|1012595812| 3.5597823|\n",
      "|   575|   5034|     5|1012593616| 3.2644007|\n",
      "|   575|    848|     4|1012605150|  3.644644|\n",
      "+------+-------+------+----------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_prediction.filter(\"userId = 575\").show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For some movies the prediction is NaN. Most likely, those movies do not have any records in df_train."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+------+----------+----------+\n",
      "|userId|movieId|rating| timestamp|prediction|\n",
      "+------+-------+------+----------+----------+\n",
      "|   514|     83|     3| 853893246|       NaN|\n",
      "|   311|     83|     3| 898008016|       NaN|\n",
      "|   383|     83|     3| 852809207|       NaN|\n",
      "|   564|    108|     4| 974713920|       NaN|\n",
      "|   555|    114|     3| 857379668|       NaN|\n",
      "|   452|    119|     3| 976423036|       NaN|\n",
      "|    86|    124|     3| 848161391|       NaN|\n",
      "|   665|    129|     3| 995232528|       NaN|\n",
      "|    41|    130|     4|1093889645|       NaN|\n",
      "|   575|    148|     4|1012605106|       NaN|\n",
      "|   564|    184|     5| 974715756|       NaN|\n",
      "|   254|    184|     4| 845158097|       NaN|\n",
      "|   312|    187|     3| 959930966|       NaN|\n",
      "|   439|    187|     2|1041115023|       NaN|\n",
      "|   393|    187|     3|1058472448|       NaN|\n",
      "|   564|    189|     4| 974838288|       NaN|\n",
      "|   182|    243|     4| 845745917|       NaN|\n",
      "|   647|    245|     3| 947292322|       NaN|\n",
      "|   516|    251|     3| 844687985|       NaN|\n",
      "|   330|    301|     5| 948574126|       NaN|\n",
      "+------+-------+------+----------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_prediction.filter(\"isnan(prediction)\").orderBy(\"movieId\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+------+----------+----------+\n",
      "|userId|movieId|rating| timestamp|prediction|\n",
      "+------+-------+------+----------+----------+\n",
      "|   575|    148|     4|1012605106|       NaN|\n",
      "+------+-------+------+----------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_prediction.filter(\"movieId = 148\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+------+---------+\n",
      "|userId|movieId|rating|timestamp|\n",
      "+------+-------+------+---------+\n",
      "+------+-------+------+---------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df_train.filter(\"movieId = 148\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------+\n",
      "| id|features|\n",
      "+---+--------+\n",
      "+---+--------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "als_model.itemFactors.filter(\"id = 148\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+--------------------+\n",
      "|userId|     recommendations|\n",
      "+------+--------------------+\n",
      "|    70|[[3153, 6.0009284...|\n",
      "|   190|[[67504, 6.059529...|\n",
      "|   280|[[83318, 5.807171...|\n",
      "|   300|[[59684, 5.281603...|\n",
      "|   340|[[121231, 5.42872...|\n",
      "|   600|[[121231, 5.47754...|\n",
      "|   640|[[5114, 5.328731]...|\n",
      "|   660|[[8530, 5.2560177...|\n",
      "|   161|[[4799, 5.131041]...|\n",
      "|   261|[[8530, 5.838298]...|\n",
      "|   471|[[8530, 4.791874]...|\n",
      "|   581|[[149, 4.6003914]...|\n",
      "|   611|[[121231, 5.71576...|\n",
      "|   641|[[6413, 5.652473]...|\n",
      "|    12|[[4467, 5.6054363...|\n",
      "|   412|[[6413, 5.442573]...|\n",
      "|   452|[[83359, 4.945468...|\n",
      "|   612|[[5114, 4.84929],...|\n",
      "|   662|[[6413, 5.068833]...|\n",
      "|    93|[[83411, 4.972806...|\n",
      "+------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "recommendations_by_user = als_model.recommendForAllUsers(10)\n",
    "recommendations_by_user.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Display the list of movies recommended for the users. If you are interested in seeing the recommendations for a given user, you can filter the result by userId."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+------------------+--------------------+--------------------+\n",
      "|movieId|userId|recommended_rating|               title|              genres|\n",
      "+-------+------+------------------+--------------------+--------------------+\n",
      "|   3153|    70|         6.0009284|7th Voyage of Sin...|Action|Adventure|...|\n",
      "|  26152|    70|          5.956971|       Batman (1966)|Action|Adventure|...|\n",
      "|  83318|    70|         5.9138904|    Goat, The (1921)|              Comedy|\n",
      "|  67504|    70|         5.9138904|Land of Silence a...|         Documentary|\n",
      "|  83411|    70|         5.9138904|         Cops (1922)|              Comedy|\n",
      "|  83359|    70|         5.9138904|Play House, The (...|              Comedy|\n",
      "|   6375|    70|         5.8239217|Gigantic (A Tale ...|         Documentary|\n",
      "|   8580|    70|         5.7976513|Into the Woods (1...|Adventure|Comedy|...|\n",
      "|   2920|    70|          5.790113|Children of Parad...|       Drama|Romance|\n",
      "|   1564|    70|         5.7485266|For Roseanna (Ros...|Comedy|Drama|Romance|\n",
      "|  67504|   190|          6.059529|Land of Silence a...|         Documentary|\n",
      "|  83318|   190|          6.059529|    Goat, The (1921)|              Comedy|\n",
      "|  83411|   190|          6.059529|         Cops (1922)|              Comedy|\n",
      "|  83359|   190|          6.059529|Play House, The (...|              Comedy|\n",
      "|   5114|   190|         5.8549128|Bad and the Beaut...|               Drama|\n",
      "|   8751|   190|          5.651474|Gun Crazy (a.k.a....|Crime|Drama|Film-...|\n",
      "|   2920|   190|         5.5644236|Children of Parad...|       Drama|Romance|\n",
      "|  26152|   190|         5.5176888|       Batman (1966)|Action|Adventure|...|\n",
      "|   7132|   190|          5.478372|Night at the Oper...|Comedy|Musical|Ro...|\n",
      "|   6669|   190|          5.387987|        Ikiru (1952)|               Drama|\n",
      "+-------+------+------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Flatten the per-user recommendation arrays into one row per (user, movie)\n",
    "# and attach the movie title and genres.\n",
    "recommendations_by_user_enriched = (\n",
    "    recommendations_by_user\n",
    "    .select(\"userId\", F.explode(\"recommendations\").alias(\"recommendation\"))\n",
    "    .select(\n",
    "        \"userId\",\n",
    "        F.col(\"recommendation.movieId\").alias(\"movieId\"),\n",
    "        F.col(\"recommendation.rating\").alias(\"recommended_rating\"),\n",
    "    )\n",
    "    .join(movies, on=\"movieId\")\n",
    ")\n",
    "\n",
    "recommendations_by_user_enriched.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+------------------+--------------------+-----------------+\n",
      "|movieId|userId|recommended_rating|               title|           genres|\n",
      "+-------+------+------------------+--------------------+-----------------+\n",
      "|  67504|   547|          5.305828|Land of Silence a...|      Documentary|\n",
      "|  83411|   547|          5.305828|         Cops (1922)|           Comedy|\n",
      "|  83318|   547|          5.305828|    Goat, The (1921)|           Comedy|\n",
      "|  83359|   547|          5.305828|Play House, The (...|           Comedy|\n",
      "|  96075|   547|          5.024853|  Bleak House (2005)|            Drama|\n",
      "|   8123|   547|          5.024853|Sammy and Rosie G...|     Comedy|Drama|\n",
      "|   8261|   547|          5.024853|3 Women (Three Wo...|            Drama|\n",
      "| 101850|   547|          5.024853|Death on the Stai...|Crime|Documentary|\n",
      "| 150856|   547|          5.024853|Making a Murderer...|      Documentary|\n",
      "|  26501|   547|          5.024853|    Choose Me (1984)|   Comedy|Romance|\n",
      "+-------+------+------------------+--------------------+-----------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "recommendations_by_user_enriched.filter(\"userId = 547\").show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find the most active users, along with their average rating and rating standard deviation. We would like to see how each user's list of highly rated movies (rating 4 or 5) matches up with the recommendations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+------------------+------------+------------------+\n",
      "|userId|        avg_rating|rating_count|        std_rating|\n",
      "+------+------------------+------------+------------------+\n",
      "|   547|  3.18025930572982|        2391|1.1206147892804919|\n",
      "|   564|3.5524625267665955|        1868|1.1718577106638184|\n",
      "|   624| 2.752737752161383|        1735|1.0923146898170415|\n",
      "|    15| 2.458823529411765|        1700|1.3042506789210637|\n",
      "|    73| 3.129192546583851|        1610|0.9950555044812739|\n",
      "|   452| 3.102238805970149|        1340|1.0467037358614122|\n",
      "|   468|2.7064291247095276|        1291| 0.830492820099226|\n",
      "|   380| 3.226716839134525|        1063|0.9399890019033345|\n",
      "|   311| 2.815505397448479|        1019|0.9367601938732544|\n",
      "|    30|3.7636003956478734|        1011|0.9262404017005623|\n",
      "|   294|3.3875395987328405|         947| 0.671665255069752|\n",
      "|   509| 3.293607800650054|         923|0.9299967846980056|\n",
      "|   580| 3.049891540130152|         922|0.8565421667014946|\n",
      "|   213|2.4186813186813185|         910|0.9947510303095398|\n",
      "|   212|2.8881278538812785|         876|0.9399310185512035|\n",
      "|   472|3.7518072289156628|         830|0.9816607190767686|\n",
      "|   388|3.5896464646464645|         792|1.0092086127421807|\n",
      "|    23| 3.371900826446281|         726|0.8956498521951798|\n",
      "|   457|2.2552594670406734|         713|1.0741751323158022|\n",
      "|   518| 3.572842998585573|         707|0.9936046100979975|\n",
      "+------+------------------+------------+------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Per-user rating statistics (mean, count, standard deviation), busiest raters first.\n",
    "rating_count_by_user = ratings.groupBy(\"userId\").agg(\n",
    "    F.avg(\"rating\").alias(\"avg_rating\"),\n",
    "    F.count(\"*\").alias(\"rating_count\"),\n",
    "    F.stddev(\"rating\").alias(\"std_rating\"),\n",
    ").sort(F.col(\"rating_count\").desc())\n",
    "\n",
    "rating_count_by_user.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(avg(std_rating)=0.9717669109338198)"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Average of the per-user rating standard deviations.\n",
    "rating_count_by_user.agg(F.avg(\"std_rating\")).first()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Plot the distribution (hist) of the number of ratings by users."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x6180e06d8>"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZAAAAD8CAYAAABZ/vJZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEwhJREFUeJzt3X+wXOV93/H3xxjjXzRAdKGKEL3YVZyQTg3qDWXGmdTBicF4EkGnJHg6toaQKDPBiT1NZyI7nZrOlAzp2KbxpEODCxNBbRMc20GtaR2ZuvHkD4MFlfklE2RbMbI0SAnYkDiBgL/9Y891NvLVvatH99zdvff9mtnZc549Z/f7LOfqw3PO2XNSVUiSdLxeMu4CJEnTyQCRJDUxQCRJTQwQSVITA0SS1MQAkSQ1MUAkSU0MEElSEwNEktTkpeMu4ESsW7euZmdnx12GJE2V+++//8+rauZE32eqA2R2dpbdu3ePuwxJmipJ/mw53sddWJKkJgaIJKmJASJJamKASJKaGCCSpCYGiCSpiQEiSWpigEiSmhggkqQmU/1L9BMxu/3Tx3xt/w1vXcFKJGk6OQKRJDUxQCRJTQwQSVITA0SS1MQAkSQ1MUAkSU0MEElSEwNEktTEAJEkNTFAJElNDBBJUpPeAiTJy5Pcl+RLSR5J8h+69nOT3Jvk8SS/n+RlXfsp3fy+7vXZvmqTJJ24PkcgzwEXV9XrgfOBS5NcBPwWcGNVbQKeBq7plr8GeLqq/jFwY7ecJGlC9RYgNfCX3ezJ3aOAi4E/6Np3AJd301u6ebrX35QkfdUnSToxvR4DSXJSkj3AYWAX8BXgm1X1QrfIAWBDN70BeAKge/1bwPf3WZ8kqV2vAVJVL1bV+cDZwIXADy+0WPe80Gijjm5Isi3J7iS7jxw5snzFSpKOy4qchVVV3wT+L3ARcFqS+RtZnQ0c7KYPABsBute/D3hqgfe6uarmqmpuZmam79IlScfQ51lYM0lO66ZfAfwksBf4HPCvusW2And10zu7ebrX/09Vfc8IRJI0Gfq8pe16YEeSkxgE1Z1V9T+TPArckeQ/Av8PuKVb/hbg9iT7GIw8ruqxNknSCeotQKrqQeCCBdq/yuB4yNHtfwNc2Vc9kqTl5S/RJUlNDBBJUhMDRJLUxACRJDUxQCRJTQwQSVITA0SS1MQAkSQ1MUAkSU0MEElSEwNEktTEAJEkNTFAJElNDBBJUhMDRJLUxACRJDUxQCRJTQwQSVITA0SS1MQAkSQ1MUAkSU0MEElSEwNEktSktwBJsjHJ55LsTfJIknd17dcl+UaSPd3jsqF13pNkX5LHklzSV22SpBP30h7f+wXg16rqgSSnAvcn2dW9dmNVvX944STnAVcBPwL8APDZJD9YVS/2WKMkqVFvI5CqOlRVD3TTzwJ7gQ2LrLIFuKOqnquqrwH7gAv7qk+SdGJW5BhIklngAuDerumdSR5McmuS07u2DcATQ6sdYPHAkSSNUe8BkuTVwCeAd1fVM8BNwGuB84FDwAfmF11g9Vrg/bYl2Z1k95EjR3qqWpK0lF4DJMnJDMLjI1X1SYCqerKqXqyq7wAf5u92Ux0ANg6tfjZw8Oj3rKqbq2ququZmZmb6LF+StIg+z8IKcAuwt6o+ONS+fmixK4CHu+mdwFVJTklyLrAJuK+v+iRJJ6bPs7DeALwdeCjJnq7tvcDbkpzPYPfUfuCXAKrqkSR3Ao8yOIPrWs/AkqTJ1VuAVNWfsPBxjbsXWed64Pq+apIkLR9/iS5JamKASJKaGCCSpCYGiCSpiQEiSWpigEiSmhggkqQmBogkqYkBIklqYoBIkpoYIJKkJgaIJKmJASJJamKASJKaGCCSpCYGiCSpiQEiSWpigEiSmhggkqQmBogkqYkBIklqMlKAJPknfRciSZouo45A/muS+5L8cpLTeq1IkjQVRgqQqvox4F8DG4Hd
ST6a5KcWWyfJxiSfS7I3ySNJ3tW1n5FkV5LHu+fTu/Yk+VCSfUkeTLL5BPsmSerRyMdAqupx4N8Bvw78C+BDSb6c5F8eY5UXgF+rqh8GLgKuTXIesB24p6o2Afd08wBvATZ1j23ATQ39kSStkFGPgfzTJDcCe4GLgZ/uguFi4MaF1qmqQ1X1QDf9bLfuBmALsKNbbAdweTe9BbitBr4AnJZkfVu3JEl9G3UE8jvAA8Drq+raoWA4yGBUsqgks8AFwL3AWVV1qFv/EHBmt9gG4Imh1Q50bUe/17Yku5PsPnLkyIjlS5KW26gBchnw0ar6a4AkL0nySoCqun2xFZO8GvgE8O6qemaxRRdoq+9pqLq5quaqam5mZmbE8iVJy23UAPks8Iqh+Vd2bYtKcjKD8PhIVX2ya35yftdU93y4az/A4CD9vLOBgyPWJ0laYaMGyMur6i/nZ7rpVy62QpIAtwB7q+qDQy/tBLZ201uBu4ba39GdjXUR8K35XV2SpMnz0hGX+6skm+ePfST5Z8BfL7HOG4C3Aw8l2dO1vRe4AbgzyTXA14Eru9fuZrCrbB/wbeDqkXshSVpxowbIu4GPJ5nfpbQe+LnFVqiqP2Hh4xoAb1pg+QKuHbEeSdKYjRQgVfXFJD8EvI5BKHy5qv6218okSRNt1BEIwI8Cs906FyShqm7rpSpJ0sQbKUCS3A68FtgDvNg1F2CASNIaNeoIZA44rztOIUnSyKfxPgz8wz4LkSRNl1FHIOuAR5PcBzw331hVP9NLVZKkiTdqgFzXZxGSpOkz6mm8f5zkHwGbquqz3XWwTuq3NEnSJBv1cu6/CPwB8Ltd0wbgD/sqSpI0+UY9iH4tg0uTPAPfvbnUmYuuIUla1UYNkOeq6vn5mSQvZYFLrUuS1o5RA+SPk7wXeEV3L/SPA/+jv7IkSZNu1ADZDhwBHgJ+icGVc5e8E6EkafUa9Sys7wAf7h6SJI18LayvsfDtZV+z7BVJkqbC8VwLa97LGdwE6ozlL0eSNC1GOgZSVX8x9PhGVf1n4OKea5MkTbBRd2FtHpp9CYMRyam9VCRJmgqj7sL6wND0C8B+4GeXvRpJ0tQY9Sysn+i7EEnSdBl1F9a/Wez1qvrg8pQjSZoWx3MW1o8CO7v5nwY+DzzRR1GSpMl3PDeU2lxVzwIkuQ74eFX9Ql+FSZIm26iXMjkHeH5o/nlgdrEVktya5HCSh4farkvyjSR7usdlQ6+9J8m+JI8lueQ4+iBJGoNRRyC3A/cl+RSDX6RfAdy2xDq/B/zOAsvdWFXvH25Ich5wFfAjwA8An03yg1X14oj1SZJW2Kg/JLweuBp4GvgmcHVV/eYS63weeGrEOrYAd1TVc1X1NWAfcOGI60qSxmDUXVgArwSeqarfBg4kObfxM9+Z5MFuF9fpXdsG/v4B+QNd2/dIsi3J7iS7jxw50liCJOlEjXpL2/cBvw68p2s6GfjvDZ93E/Ba4HzgEH/3A8UssOyCN6yqqpuraq6q5mZmZhpKkCQth1FHIFcAPwP8FUBVHaThUiZV9WRVvTh0efj53VQHgI1Di54NHDze95ckrZxRA+T5qiq6UUGSV7V8WJL1Q7NXAPNnaO0ErkpySrdrbBNwX8tnSJJWxqhnYd2Z5HeB05L8IvDzLHFzqSQfA94IrEtyAHgf8MYk5zMIov0M7m5IVT2S5E7gUQbX2rrWM7AkabKNei2s93f3Qn8GeB3w76tq1xLrvG2B5lsWWf564PpR6pEkjd+SAZLkJOAzVfWTwKKhIUlaO5Y8BtLtSvp2ku9bgXokSVNi1GMgfwM8lGQX3ZlYAFX1q71UJUmaeKMGyKe7hyRJwBIBkuScqvp6Ve1YqYIkSdNhqWMgfzg/keQTPdciSZoiSwXI8CVGXtNnIZKk6bJUgNQxpiVJa9xSB9Ffn+QZBiORV3TTdPNVVf+g1+okSRNr0QCpqpNWqhBJ0nQ5nvuBSJL0XQaIJKmJASJJamKASJKa
GCCSpCYGiCSpiQEiSWpigEiSmhggkqQmBogkqcmoN5RaU2a3L3zvrP03vHWFK5GkyeUIRJLUpLcASXJrksNJHh5qOyPJriSPd8+nd+1J8qEk+5I8mGRzX3VJkpZHnyOQ3wMuPaptO3BPVW0C7unmAd4CbOoe24CbeqxLkrQMeguQqvo88NRRzVuA+fur7wAuH2q/rQa+AJyWZH1ftUmSTtxKHwM5q6oOAXTPZ3btG4AnhpY70LVJkibUpBxEzwJtC95CN8m2JLuT7D5y5EjPZUmSjmWlA+TJ+V1T3fPhrv0AsHFoubOBgwu9QVXdXFVzVTU3MzPTa7GSpGNb6QDZCWztprcCdw21v6M7G+si4Fvzu7okSZOptx8SJvkY8EZgXZIDwPuAG4A7k1wDfB24slv8buAyYB/wbeDqvuqSJC2P3gKkqt52jJfetMCyBVzbVy2SpOU3KQfRJUlTxgCRJDUxQCRJTQwQSVITA0SS1MQAkSQ1MUAkSU0MEElSEwNEktTEAJEkNTFAJElNDBBJUhMDRJLUxACRJDXp7XLuq9Hs9k8v2L7/hreucCWSNH6OQCRJTQwQSVITA0SS1MQAkSQ1MUAkSU0MEElSEwNEktTEAJEkNRnLDwmT7AeeBV4EXqiquSRnAL8PzAL7gZ+tqqfHUZ8kaWnjHIH8RFWdX1Vz3fx24J6q2gTc081LkibUJO3C2gLs6KZ3AJePsRZJ0hLGFSAF/FGS+5Ns69rOqqpDAN3zmWOqTZI0gnFdTPENVXUwyZnAriRfHnXFLnC2AZxzzjl91SdJWsJYRiBVdbB7Pgx8CrgQeDLJeoDu+fAx1r25quaqam5mZmalSpYkHWXFAyTJq5KcOj8NvBl4GNgJbO0W2wrctdK1SZJGN45dWGcBn0oy//kfrar/neSLwJ1JrgG+Dlw5htokSSNa8QCpqq8Cr1+g/S+AN610PZKkNpN0Gq8kaYoYIJKkJt4TfRl4r3RJa5EjEElSEwNEktTEAJEkNTFAJElNDBBJUhPPwuqRZ2dJWs0cgUiSmhggkqQmBogkqYkBIklqYoBIkpp4FtYYeHaWpNXAEYgkqYkjkAniyETSNHEEIklqYoBIkpoYIJKkJh4DmWIeM5E0To5AJElNHIFMgWONNCRpnCYuQJJcCvw2cBLw36rqhjGXtGos5y4vd59JmqgASXIS8F+AnwIOAF9MsrOqHh1vZdNlnCMWg0VaOyYqQIALgX1V9VWAJHcAWwADZAymadeZwSWtvEkLkA3AE0PzB4B/PqZa1oxJHLGM6/1XInAmsaaFHG8ot/y3PN73Wov/QzDJ30Wqatw1fFeSK4FLquoXuvm3AxdW1a8MLbMN2NbNvg547Dg/Zh3w58tQ7rSy//Z/Lfcf/A7WAa+qqpkTfaNJG4EcADYOzZ8NHBxeoKpuBm5u/YAku6tqrnX9aWf/7f9a7j/4HXT9n12O95q034F8EdiU5NwkLwOuAnaOuSZJ0gImagRSVS8keSfwGQan8d5aVY+MuSxJ0gImKkAAqupu4O4eP6J599cqYf/XtrXef/A7WLb+T9RBdEnS9Ji0YyCSpCmxZgIkyaVJHkuyL8n2cdfTlyT7kzyUZE+S3V3bGUl2JXm8ez69a0+SD3XfyYNJNo+3+jZJbk1yOMnDQ23H3eckW7vlH0+ydRx9aXGM/l+X5BvddrAnyWVDr72n6/9jSS4Zap/Kv5EkG5N8LsneJI8keVfXvia2gUX63/82UFWr/sHggPxXgNcALwO+BJw37rp66ut+YN1Rbf8J2N5Nbwd+q5u+DPhfQICLgHvHXX9jn38c2Aw83Npn4Azgq93z6d306ePu2wn0/zrg3y6w7Hnd9n8KcG73d3HSNP+NAOuBzd30qcCfdv1cE9vAIv3vfRtYKyOQ714ipaqeB+YvkbJWbAF2dNM7gMuH2m+rgS8ApyVZP44CT0RVfR546qjm4+3zJcCuqnqqqp4GdgGX9l/9iTtG/49lC3BHVT1X
VV8D9jH4+5jav5GqOlRVD3TTzwJ7GVzVYk1sA4v0/1iWbRtYKwGy0CVSFvuCp1kBf5Tk/u5X+wBnVdUhGGxswJld+2r+Xo63z6vxu3hnt4vm1vndN6zy/ieZBS4A7mUNbgNH9R963gbWSoBkgbbVevrZG6pqM/AW4NokP77Ismvpe5l3rD6vtu/iJuC1wPnAIeADXfuq7X+SVwOfAN5dVc8stugCbVP/HSzQ/963gbUSIEteImW1qKqD3fNh4FMMhqVPzu+a6p4Pd4uv5u/lePu8qr6Lqnqyql6squ8AH2awHcAq7X+Skxn84/mRqvpk17xmtoGF+r8S28BaCZA1cYmUJK9Kcur8NPBm4GEGfZ0/o2QrcFc3vRN4R3dWykXAt+aH/KvA8fb5M8Cbk5zeDfXf3LVNpaOOZV3BYDuAQf+vSnJKknOBTcB9TPHfSJIAtwB7q+qDQy+tiW3gWP1fkW1g3GcQrNSDwZkXf8rgLIPfGHc9PfXxNQzOnPgS8Mh8P4HvB+4BHu+ez+jaw+AGXl8BHgLmxt2Hxn5/jMEQ/W8Z/F/UNS19Bn6ewQHFfcDV4+7XCfb/9q5/D3b/CKwfWv43uv4/BrxlqH0q/0aAH2Owq+VBYE/3uGytbAOL9L/3bcBfokuSmqyVXViSpGVmgEiSmhggkqQmBogkqYkBIklqYoBIkpoYIJKkJgaIJKnJ/wezpjqNpJH8AwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x6180e8630>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Histogram of how many ratings each user has given. The title and axis labels make the\n",
    "# figure readable on its own; the trailing ';' keeps the return value of .set() out of the output.\n",
    "(rating_count_by_user\n",
    " .select(\"rating_count\")\n",
    " .toPandas()[\"rating_count\"]\n",
    " .plot.hist(bins=50, title=\"Distribution of the number of ratings per user\")\n",
    " .set(xlabel=\"Number of ratings given by a user\", ylabel=\"Number of users\")\n",
    ");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get movie recommendations for a given user, and look at the recommendations for one of the most active users."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+------------------+--------------------+--------------------+\n",
      "|movieId|userId|recommended_rating|               title|              genres|\n",
      "+-------+------+------------------+--------------------+--------------------+\n",
      "|  83318|   547|          5.305828|    Goat, The (1921)|              Comedy|\n",
      "|  83359|   547|          5.305828|Play House, The (...|              Comedy|\n",
      "|  67504|   547|          5.305828|Land of Silence a...|         Documentary|\n",
      "|  83411|   547|          5.305828|         Cops (1922)|              Comedy|\n",
      "|   1180|   547|         4.5169625| Hear My Song (1991)|              Comedy|\n",
      "|  86504|   547|          4.419376|Voices from the L...|         Documentary|\n",
      "|   8751|   547|         4.3445444|Gun Crazy (a.k.a....|Crime|Drama|Film-...|\n",
      "|   4427|   547|         4.2814674|Lion in Winter, T...|               Drama|\n",
      "|  56715|   547|          4.275058|Wristcutters: A L...|Drama|Fantasy|Rom...|\n",
      "|   6918|   547|         4.2680645|Unvanquished, The...|               Drama|\n",
      "|   2099|   547|          4.267149|Song of the South...|Adventure|Animati...|\n",
      "|   8609|   547|          4.244663|Our Hospitality (...|              Comedy|\n",
      "|  25764|   547|          4.244663|Cameraman, The (1...|Comedy|Drama|Romance|\n",
      "|  71650|   547|          4.244663|Werner Herzog Eat...|         Documentary|\n",
      "|  50259|   547|          4.244663|      Old Joy (2006)|               Drama|\n",
      "|  72647|   547|          4.244663| Zorn's Lemma (1970)|               Drama|\n",
      "|  66019|   547|          4.244663|Great Ecstasy of ...|         Documentary|\n",
      "|   7309|   547|          4.244663|Black Pirate, The...|    Action|Adventure|\n",
      "|   3232|   547|          4.244663|Seven Chances (1925)|              Comedy|\n",
      "|  65638|   547|          4.244663|        Aspen (1991)|         Documentary|\n",
      "+-------+------+------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "def reco_for_user(userId, num_items=500):\n",
    "    \"\"\"Return ALS recommendations for one user, highest predicted rating first.\n",
    "\n",
    "    Movies the user already has a row for in `ratings` are removed, and the\n",
    "    remaining candidates are joined with `movies` on movieId.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    userId : int\n",
    "        User to generate recommendations for. The user is looked up in\n",
    "        `df_train`; a user with no rows there yields an empty result.\n",
    "    num_items : int, optional\n",
    "        Number of candidates requested from the ALS model before the\n",
    "        already-rated movies are filtered out (default 500).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    DataFrame\n",
    "        Columns movieId, userId, recommended_rating, followed by the\n",
    "        remaining columns of `movies`.\n",
    "    \"\"\"\n",
    "    # The model only needs the user id, so pass one de-duplicated column\n",
    "    # instead of every training row of that user.\n",
    "    user_subset = df_train.filter(F.col(\"userId\") == userId).select(\"userId\").distinct()\n",
    "    already_rated = ratings.filter(F.col(\"userId\") == userId)\n",
    "    return (\n",
    "        als_model\n",
    "        .recommendForUserSubset(user_subset, num_items)\n",
    "        # One row per recommended movie; each array element is a struct (movieId, rating).\n",
    "        .select(\"userId\", F.explode(\"recommendations\").alias(\"recommendation\"))\n",
    "        .select(\n",
    "            \"userId\",\n",
    "            F.col(\"recommendation.movieId\").alias(\"movieId\"),\n",
    "            F.col(\"recommendation.rating\").alias(\"recommended_rating\"),\n",
    "        )\n",
    "        # The anti-join drops movies the user has already rated.\n",
    "        .join(already_rated, on=\"movieId\", how=\"leftanti\")\n",
    "        .join(movies, on=\"movieId\")\n",
    "        .orderBy(F.desc(\"recommended_rating\"))\n",
    "    )\n",
    "\n",
    "reco_for_user(547).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find a few users who have a moderate number of ratings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+----------+------------+------------------+\n",
      "|userId|avg_rating|rating_count|        std_rating|\n",
      "+------+----------+------------+------------------+\n",
      "|   637|      3.96|          25|1.0198039027185568|\n",
      "|   331|      3.48|          25|1.2948616399703357|\n",
      "|   375|      3.56|          25| 1.260952021291849|\n",
      "|   377|      3.12|          25|0.9273618495495702|\n",
      "|    44|      3.36|          25|0.8602325267042625|\n",
      "|   114|      4.08|          25|1.0376254944182253|\n",
      "|   100|       3.4|          25|0.7071067811865476|\n",
      "|   269|       3.4|          25|1.0801234497346432|\n",
      "|   446|      4.44|          25|0.5066228051190221|\n",
      "|   556|       4.4|          25|0.7637626158259734|\n",
      "|   538|       3.6|          25|1.0408329997330665|\n",
      "|   392|      2.92|          25|1.1150485789118487|\n",
      "|   116|       3.8|          25| 1.384437310486346|\n",
      "|   495|      4.08|          25|0.7593857166596345|\n",
      "+------+----------+------------+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Users who have rated exactly 25 movies.\n",
    "rating_count_by_user.where(F.col(\"rating_count\") == 25).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------+------------------+--------------------+--------------------+\n",
      "|movieId|userId|recommended_rating|               title|              genres|\n",
      "+-------+------+------------------+--------------------+--------------------+\n",
      "|  83318|   100|          5.399297|    Goat, The (1921)|              Comedy|\n",
      "|  83359|   100|          5.399297|Play House, The (...|              Comedy|\n",
      "|  67504|   100|          5.399297|Land of Silence a...|         Documentary|\n",
      "|  83411|   100|          5.399297|         Cops (1922)|              Comedy|\n",
      "|   5114|   100|         5.2913103|Bad and the Beaut...|               Drama|\n",
      "|   2202|   100|         4.9161224|     Lifeboat (1944)|           Drama|War|\n",
      "|   7087|   100|         4.9139767|Passage to India,...|     Adventure|Drama|\n",
      "|   3989|   100|         4.8830647|One Day in Septem...|         Documentary|\n",
      "|   2920|   100|         4.8446193|Children of Parad...|       Drama|Romance|\n",
      "|   8264|   100|         4.8080645| Grey Gardens (1975)|         Documentary|\n",
      "|    994|   100|          4.786193|    Big Night (1996)|        Comedy|Drama|\n",
      "|   3730|   100|         4.7740874|Conversation, The...|       Drama|Mystery|\n",
      "|   7096|   100|         4.7552195|Rivers and Tides ...|         Documentary|\n",
      "|   1564|   100|         4.7492228|For Roseanna (Ros...|Comedy|Drama|Romance|\n",
      "|   7132|   100|          4.742795|Night at the Oper...|Comedy|Musical|Ro...|\n",
      "|   7767|   100|          4.739683|Best of Youth, Th...|               Drama|\n",
      "|   8121|   100|          4.738728|Seducing Doctor L...|              Comedy|\n",
      "|  47728|   100|          4.738728|Green for Danger ...|       Crime|Mystery|\n",
      "|  25852|   100|          4.738728|     Gaslight (1940)|    Mystery|Thriller|\n",
      "|   4612|   100|          4.738728|Jesus of Montreal...|               Drama|\n",
      "+-------+------+------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "reco_for_user(100).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluate the recommendations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once more, let's look at the model output, this time the predicted ratings for the user-movie combinations in the df_test dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+-------+------+----------+----------+\n",
      "|userId|movieId|rating| timestamp|prediction|\n",
      "+------+-------+------+----------+----------+\n",
      "|   157|     12|     2|1291598164| 2.2820864|\n",
      "|   564|     12|     1| 974709821| 3.0669582|\n",
      "|    91|     12|     3|1448813887| 3.8011193|\n",
      "|   624|     12|     2|1019563753| 2.0999196|\n",
      "|   636|     18|     3| 855227364| 3.5984566|\n",
      "|   616|     18|     4| 860573132| 3.3177824|\n",
      "|   571|     18|     5|1334342436| 2.6853292|\n",
      "|   135|     18|     4| 844996129| 3.0119503|\n",
      "|   255|     18|     2|1236980522|   2.22563|\n",
      "|   461|     18|     1|1091959887| 2.5870395|\n",
      "|   408|     18|     5| 933116210| 2.7201877|\n",
      "|   507|     18|     4| 862091839| 3.9740381|\n",
      "|   177|     18|     4| 907380994| 3.8975425|\n",
      "|    30|     18|     2| 945277971|   3.56654|\n",
      "|   655|     18|     4|1470073389| 3.5001454|\n",
      "|   165|     70|     5|1111480089| 2.4006498|\n",
      "|   358|     70|     1| 957534713| 2.4248333|\n",
      "|    34|     70|     4| 973746231| 3.1537461|\n",
      "|   580|     70|     2|1165292373| 2.4286094|\n",
      "|   647|     70|     3| 947292818| 2.8044093|\n",
      "+------+-------+------+----------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "als_model.transform(df_test).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find the root mean squared error (RMSE) on df_test."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+\n",
      "|         test_rmse|\n",
      "+------------------+\n",
      "|0.9751612132040609|\n",
      "+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# RMSE on the held-out ratings; rows with a NaN prediction are excluded first.\n",
    "(als_model\n",
    " .transform(df_test)\n",
    " .where(~F.isnan(\"prediction\"))\n",
    " .agg(F.sqrt(F.avg(F.pow(F.col(\"rating\") - F.col(\"prediction\"), 2))).alias(\"test_rmse\"))\n",
    " .show())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find the RMSE for the training data. It is expected that the RMSE for the training dataset will be better than for the test dataset. The lower the RMSE, the better."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+\n",
      "|        train_rmse|\n",
      "+------------------+\n",
      "|0.5988259423059014|\n",
      "+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# RMSE on the training ratings; rows with a NaN prediction are excluded first.\n",
    "(als_model\n",
    " .transform(df_train)\n",
    " .where(~F.isnan(\"prediction\"))\n",
    " .agg(F.sqrt(F.avg(F.pow(F.col(\"rating\") - F.col(\"prediction\"), 2))).alias(\"train_rmse\"))\n",
    " .show())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classification Models for recommendations\n",
    "\n",
    "The idea is to estimate what the rating will be, based on the features of movies and users.\n",
    "\n",
    "Example of features of the movies \n",
    "- year of release\n",
    "- number of users who have rated it high (4 or 5)\n",
    "- number of ratings (for counts of such high magnitude, usually take the log of the count)\n",
    "- associated genre \n",
    "- genre properties such as the number of ratings and the average rating for the genre\n",
    "\n",
    "Features by users \n",
    "- how many movies the user has rated high \n",
    "- standard deviation of his/her rating"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's create a dataset that we can filter by genre."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+--------------------+--------------------+----+---------+------+------+-------------------+\n",
      "|movieId|               title|              genres|year|    genre|userId|rating|          timestamp|\n",
      "+-------+--------------------+--------------------+----+---------+------+------+-------------------+\n",
      "|     31|Dangerous Minds (...|             [Drama]|1995|    Drama|     1|     2|2009-12-14 08:22:24|\n",
      "|   1029|        Dumbo (1941)|[Animation, Child...|1941|  Musical|     1|     3|2009-12-14 08:22:59|\n",
      "|   1029|        Dumbo (1941)|[Animation, Child...|1941|    Drama|     1|     3|2009-12-14 08:22:59|\n",
      "|   1029|        Dumbo (1941)|[Animation, Child...|1941| Children|     1|     3|2009-12-14 08:22:59|\n",
      "|   1029|        Dumbo (1941)|[Animation, Child...|1941|Animation|     1|     3|2009-12-14 08:22:59|\n",
      "|   1061|     Sleepers (1996)|          [Thriller]|1996| Thriller|     1|     3|2009-12-14 08:23:02|\n",
      "|   1129|Escape from New Y...|[Action, Adventur...|1981| Thriller|     1|     2|2009-12-14 08:23:05|\n",
      "|   1129|Escape from New Y...|[Action, Adventur...|1981|   Sci-Fi|     1|     2|2009-12-14 08:23:05|\n",
      "|   1129|Escape from New Y...|[Action, Adventur...|1981|Adventure|     1|     2|2009-12-14 08:23:05|\n",
      "|   1129|Escape from New Y...|[Action, Adventur...|1981|   Action|     1|     2|2009-12-14 08:23:05|\n",
      "|   1172|Cinema Paradiso (...|             [Drama]|1989|    Drama|     1|     4|2009-12-14 08:23:25|\n",
      "|   1263|Deer Hunter, The ...|        [Drama, War]|1978|      War|     1|     2|2009-12-14 08:22:31|\n",
      "|   1263|Deer Hunter, The ...|        [Drama, War]|1978|    Drama|     1|     2|2009-12-14 08:22:31|\n",
      "|   1287|      Ben-Hur (1959)|[Action, Adventur...|1959|    Drama|     1|     2|2009-12-14 08:23:07|\n",
      "|   1287|      Ben-Hur (1959)|[Action, Adventur...|1959|Adventure|     1|     2|2009-12-14 08:23:07|\n",
      "|   1287|      Ben-Hur (1959)|[Action, Adventur...|1959|   Action|     1|     2|2009-12-14 08:23:07|\n",
      "|   1293|       Gandhi (1982)|             [Drama]|1982|    Drama|     1|     2|2009-12-14 08:22:28|\n",
      "|   1339|Dracula (Bram Sto...|[Fantasy, Horror,...|1992| Thriller|     1|     3|2009-12-14 08:22:05|\n",
      "|   1339|Dracula (Bram Sto...|[Fantasy, Horror,...|1992|  Romance|     1|     3|2009-12-14 08:22:05|\n",
      "|   1339|Dracula (Bram Sto...|[Fantasy, Horror,...|1992|   Horror|     1|     3|2009-12-14 08:22:05|\n",
      "+-------+--------------------+--------------------+----+---------+------+------+-------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# One row per (movie, genre, rating): split the pipe-separated genres, explode them,\n",
    "# and attach every rating of the movie.\n",
    "genre_rating = (\n",
    "    movies\n",
    "    # The release year is the 4-digit number in the trailing parentheses, e.g. \"Dumbo (1941)\".\n",
    "    # Matching only that group avoids picking up digits that are part of the title itself;\n",
    "    # titles without such a suffix get an empty string.\n",
    "    .withColumn(\"year\", F.regexp_extract(\"title\", r\"\\((\\d{4})\\)\\s*$\", 1))\n",
    "    # Raw string: '|' is a regex metacharacter and must be escaped for the split pattern.\n",
    "    .withColumn(\"genres\", F.split(\"genres\", r\"\\|\"))\n",
    "    .withColumn(\"genre\", F.explode(\"genres\"))\n",
    "    .join(ratings, on=\"movieId\")\n",
    "    # Convert the epoch-seconds timestamp into a readable date-time string.\n",
    "    .withColumn(\"timestamp\", F.expr(\"from_unixtime(timestamp)\"))\n",
    ")\n",
    "\n",
    "genre_rating.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of unique values of the genres"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(count(DISTINCT genre)=20)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct genre labels after exploding the genres column.\n",
    "genre_rating.agg(F.countDistinct(\"genre\")).first()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x6181d1400>"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAFYCAYAAACoOrwdAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3Xm4HFWdxvHvS4gsIqBwFQRCFEFFlC0i2yiLC4rAjIKAoohLHBUBRR1xAUSdcRmXURSMgkZk2NUJDruACsiSQCAsUSLokEGHsIMgGPjNH+d00un0vV3VVfd2W7yf5+nndldXnT7dt+vXp86qiMDMzJplhUFnwMzM6ufgbmbWQA7uZmYN5OBuZtZADu5mZg3k4G5m1kAO7mZmDVQ4uEuaJOk6ST/v8txKkk6TtEDSVZKm1plJMzMrp0zJ/VDgllGeezdwX0S8APg68KWqGTMzs/6tWGQnSesDuwNfAD7SZZe9gKPz/TOBYyUpxhj+uvbaa8fUqVNLZdbM7Kluzpw5d0fESK/9CgV34BvAx4FnjPL8esAdABGxWNIDwFrA3aMlOHXqVGbPnl3w5c3MDEDSH4vs17NaRtIbgbsiYs5Yu3XZtlypXdJ0SbMlzV60aFGR/JmZWR+K1LnvAOwp6Q/AqcAukn7csc9CYAMASSsCawD3diYUETMiYlpETBsZ6XlVYWZmfeoZ3CPiiIhYPyKmAvsBF0fEAR27zQIOzPf3zvt4ukkzswEpWue+HEnHALMjYhZwAnCSpAWkEvt+NeXPzMz6UCq4R8SlwKX5/pFt2/8K7FNnxszMrH8eoWpm1kAO7mZmDeTgbmbWQA7uZmYN1HdvGbOnkqmf+O8xn//DF3efoJyYFeOSu5lZAzm4m5k1kIO7mVkDObibmTWQg7uZWQM5uJuZNZC7QprZhPr2P1/cc58PHr/LBOSk2VxyNzNrIJfch5BLNmZWlUvuZmYN5OBuZtZArpYxMxuQX1y8Uc99dt3l932l7ZK7mVkD9QzuklaWdLWk6yXdJOmzXfZ5p6RFkubm23vGJ7tmZlZEkWqZx4BdIuJhSZOByySdGxFXdux3WkQcXH8WzZrB0wYPl4Wf+PWYz6//xX+YoJyMj57BPSICeDg/nJxvMZ6ZMjMby1f3fWPPfQ4/7ecTkJPhVajOXdIkSXOBu4ALI+KqLru9WdINks6UtEGtuTQzs1IK9ZaJiCeALSStCfxU0mYRcWPbLmcDp0TEY5L+GZgJLDfKRtJ0YDrAlClTKmfe7Cnl6DUK7PPA+OfDljj66KMrPT+eSvWWiYj7gUuB3Tq23xMRj+WH3wO2HuX4GRExLSKmjYyM9JFdMzMrokhvmZFcYkfSKsCrgfkd+6zb9nBP4JY6M2lmZuUUqZZZF5gpaRLpx+D0iPi5pGOA2RExCzhE0p7AYuBe4J3jleEx9bpsLXDJ+tKZLx3z+XkHziuTIzOzgSjSW+YGYMsu249su38EcES9WTMzs355hKqZWQN5bhkbVR2DPKr2JhjPuTesP7e86MVjPv/i+W5yGwYuuZuZNZCDu5lZA7lapqF6Dc9+qg/NNms6B/dx4DrJ4bLOJXPHfP7PO28xQTkZPHf1fepwtYyZWQM5uJuZNZCDu5lZAzm4m5k1kIO7mVkDObibmTXQ0HSF7LW+JHiNSTOzolxyNzNrIAd3M7MGcnA3M2sgB3czswZycDcza6AiC2SvLOlqSddLuknSZ7vss5Kk0yQtkHSVpKnjkVkzMyumSMn9MWCXiNgc2ALYTdK2Hfu8G7gvIl4AfB34Ur3ZNDOzMnoG90gezg8n51t07LYXMDPfPxPYVZJqy6WZmZVSqM5d0iRJc4G7gAsj4qqOXdYD7gCIiMXAA8BadWbUzMyKKxTcI+KJiNgCWB/YRtJmHbt0K6V3lu6RNF3SbEmzFy1aVD63ZmZWSKneMhFxP3ApsFvHUwuBDQAkrQis
Adzb5fgZETEtIqaNjIz0lWEzM+utSG+ZEUlr5vurAK8G5nfsNgs4MN/fG7g4IpYruZuZ2cQoMnHYusBMSZNIPwanR8TPJR0DzI6IWcAJwEmSFpBK7PuNW47NzKynnsE9Im4Atuyy/ci2+38F9qk3a2Zm1i+PUDUzayAHdzOzBnJwNzNrIAd3M7MGcnA3M2sgB3czswZycDczayAHdzOzBnJwNzNrIAd3M7MGcnA3M2sgB3czswZycDczayAHdzOzBnJwNzNrIAd3M7MGcnA3M2sgB3czswYqskD2BpIukXSLpJskHdpln50kPSBpbr4d2S0tMzObGEUWyF4MHB4R10p6BjBH0oURcXPHfr+OiDfWn0UzMyurZ8k9Iv4UEdfm+w8BtwDrjXfGzMysf6Xq3CVNBbYErury9HaSrpd0rqSX1JA3MzPrU5FqGQAkrQacBRwWEQ92PH0tsGFEPCzpDcDPgI27pDEdmA4wZcqUvjNtZmZjK1RylzSZFNhPjoifdD4fEQ9GxMP5/jnAZElrd9lvRkRMi4hpIyMjFbNuZmajKdJbRsAJwC0R8bVR9lkn74ekbXK699SZUTMzK65ItcwOwNuBeZLm5m2fBKYARMTxwN7A+yUtBh4F9ouIGIf8mplZAT2De0RcBqjHPscCx9aVKTMzq8YjVM3MGsjB3cysgRzczcwayMHdzKyBHNzNzBrIwd3MrIEc3M3MGsjB3cysgRzczcwayMHdzKyBHNzNzBrIwd3MrIEc3M3MGsjB3cysgRzczcwayMHdzKyBHNzNzBrIwd3MrIGKLJC9gaRLJN0i6SZJh3bZR5K+KWmBpBskbTU+2TUzsyKKLJC9GDg8Iq6V9AxgjqQLI+Lmtn1eD2ycb68Ajst/zcxsAHqW3CPiTxFxbb7/EHALsF7HbnsBP4rkSmBNSevWnlszMyukVJ27pKnAlsBVHU+tB9zR9nghy/8AmJnZBCkc3CWtBpwFHBYRD3Y+3eWQ6JLGdEmzJc1etGhRuZyamVlhhYK7pMmkwH5yRPykyy4LgQ3aHq8P3Nm5U0TMiIhpETFtZGSkn/yamVkBRXrLCDgBuCUivjbKbrOAd+ReM9sCD0TEn2rMp5mZlVCkt8wOwNuBeZLm5m2fBKYARMTxwDnAG4AFwCPAQfVn1czMiuoZ3CPiMrrXqbfvE8AH68qUmZlV4xGqZmYN5OBuZtZADu5mZg3k4G5m1kAO7mZmDeTgbmbWQA7uZmYN5OBuZtZADu5mZg3k4G5m1kAO7mZmDeTgbmbWQA7uZmYN5OBuZtZADu5mZg3k4G5m1kAO7mZmDeTgbmbWQEUWyD5R0l2Sbhzl+Z0kPSBpbr4dWX82zcysjCILZP8QOBb40Rj7/Doi3lhLjszMrLKeJfeI+BVw7wTkxczMalJXnft2kq6XdK6kl9SUppmZ9alItUwv1wIbRsTDkt4A/AzYuNuOkqYD0wGmTJlSw0ubmVk3lUvuEfFgRDyc758DTJa09ij7zoiIaRExbWRkpOpLm5nZKCoHd0nrSFK+v01O856q6ZqZWf96VstIOgXYCVhb0kLgKGAyQEQcD+wNvF/SYuBRYL+IiHHLsZmZ9dQzuEfE/j2eP5bUVdLMzIaER6iamTWQg7uZWQM5uJuZNZCDu5lZAzm4m5k1kIO7mVkDObibmTWQg7uZWQM5uJuZNZCDu5lZAzm4m5k1kIO7mVkDObibmTWQg7uZWQM5uJuZNZCDu5lZAzm4m5k1kIO7mVkD9Qzukk6UdJekG0d5XpK+KWmBpBskbVV/Ns3MrIwiJfcfAruN8fzrgY3zbTpwXPVsmZlZFT2De0T8Crh3jF32An4UyZXAmpLWrSuDZmZWXh117usBd7Q9Xpi3mZnZgNQR3NVlW3TdUZouabak2YsWLarhpc3MrJs6gvtCYIO2x+sDd3bbMSJmRMS0iJg2MjJSw0ubmVk3dQT3WcA7cq+ZbYEHIuJPNaRrZmZ9WrHXDpJOAXYC1pa0
EDgKmAwQEccD5wBvABYAjwAHjVdmzcysmJ7BPSL27/F8AB+sLUdmZlaZR6iamTWQg7uZWQM5uJuZNZCDu5lZAzm4m5k1kIO7mVkDObibmTWQg7uZWQM5uJuZNZCDu5lZAzm4m5k1kIO7mVkDObibmTWQg7uZWQM5uJuZNZCDu5lZAzm4m5k1kIO7mVkDFQruknaT9FtJCyR9osvz75S0SNLcfHtP/Vk1M7OiiiyQPQn4NvAaYCFwjaRZEXFzx66nRcTB45BHMzMrqUjJfRtgQUTcFhGPA6cCe41vtszMrIoiwX094I62xwvztk5vlnSDpDMlbVBL7szMrC9Fgru6bIuOx2cDUyPiZcBFwMyuCUnTJc2WNHvRokXlcmpmZoUVCe4LgfaS+PrAne07RMQ9EfFYfvg9YOtuCUXEjIiYFhHTRkZG+smvmZkVUCS4XwNsLOl5kp4G7AfMat9B0rptD/cEbqkvi2ZmVlbP3jIRsVjSwcD5wCTgxIi4SdIxwOyImAUcImlPYDFwL/DOccyzmZn10DO4A0TEOcA5HduObLt/BHBEvVkzM7N+eYSqmVkDObibmTWQg7uZWQM5uJuZNZCDu5lZAzm4m5k1kIO7mVkDObibmTWQg7uZWQM5uJuZNZCDu5lZAzm4m5k1kIO7mVkDObibmTWQg7uZWQM5uJuZNZCDu5lZAzm4m5k1UKHgLmk3Sb+VtEDSJ7o8v5Kk0/LzV0maWndGzcysuJ7BXdIk4NvA64FNgf0lbdqx27uB+yLiBcDXgS/VnVEzMyuuSMl9G2BBRNwWEY8DpwJ7deyzFzAz3z8T2FWS6summZmVUSS4rwfc0fZ4Yd7WdZ+IWAw8AKxVRwbNzKw8RcTYO0j7AK+LiPfkx28HtomID7Xtc1PeZ2F+/Pu8zz0daU0HpueHLwR+2yN/awN3F387tR/fpDSGIQ91pDEMeRiWNIYhD8OSxjDkYaLS2DAiRnqmEhFj3oDtgPPbHh8BHNGxz/nAdvn+ijlj6pV2gdeePcjjm5TGMOTB78OfhT+LiUkjIgpVy1wDbCzpeZKeBuwHzOrYZxZwYL6/N3Bx5FyamdnEW7HXDhGxWNLBpNL5JODEiLhJ0jGkX5hZwAnASZIWAPeSfgDMzGxAegZ3gIg4BzinY9uRbff/CuxTb9YAmDHg45uUxjDkoY40hiEPw5LGMORhWNIYhjwMUxq9G1TNzOzvj6cfMDNrIAd3M7MGcnC3oSVpBUlvGXQ+YEleth90PppAyQaDzsewyN+tLSXtLmkXSc+pI92hCu6SJkm6aND5aJH09D6PO1jSM2t4/TdKqvQ/yp/pcyVNad3KHl/htQ/vdryktSSd0Ov4iHgSOLjf169TzstX60ir3+/VsMjB+QBJR+bHUyRtU/T43E36ZzXko/L5UYd+zzFJG0maASwAvgjsD3wAuFDSlZIOqvL+Bv7BtIuIJ4BHJK3RbxqSdpB0oaTfSbpN0u2SbiuZxvaSbgZuyY83l/SdEkmsA1wj6fQ8o2a/8+zsB9wq6cuSXlz2YEkfAv4PuBD473z7eclkFkj6SpfJ4op4ITBH0g5tefoAMBuYVzCNCyV9VNIGkp7VupXJhKQRSZ+UNEPSia1bmTSyCyS9ud//Z5XvlaSzJc0a7VYwjcvy34ckPdh2e0jSgyXeyndIgxv3z48fIk0uWMaVkl5e8phOlc4PAEkbSzpT0s05XtxWJl5UPMc+D/wY2CgiXhcRB0TE3hHxMmBPYA3g7aXeUHvehq23jKTTgW1JH9ZfWtsj4pCCx88HPgzMAZ5oO/6eUQ9aPo2rSIOxZkXElnnbjRGxWYk0BLwWOAiYBpwOnBARvy+aRk5nddJJdBAQwA+AUyLioQLHLgBeUea9d0njGaST6CBSYeBE4NSIKBQMclXGscBNwIuAW4HDI+JPBY+/vcvmiIjnFzk+p3EF8GuW/06cVTSNnM5DwNOBxcBfAeW8rF7w+L6/V5JeNdbzEfHLInmog6Rr
I2IrSde1vY/rI2LzEmncDGwC/JF0nrc+y5eVzEvf50c+/jLgKNJstnvkdBQRRxU8vvI5Nl4K9XOfYK1fv349EBHnVs1ERNzRUUB7YrR9Rzk+JP0Z+DMpGDwTOFPShRHx8RLpPCjpLGAV4DDgn4CPSfpmRHyrx+F3kCZx61s+Sb4HfE/SK4FTgK9LOhP4XEQs6JHEjaRRzruRTuDCgT2//vP6y/kyVo2If6maSEQ8o4Y0+vpe1RG8Jb0pIn6S7z8zIu7rM6m/5eq2yGmNAE+WTOP1fb72MiqeHwCrRMQvJCki/ggcLenXpIBfRN/nmKQ3jfV863/Vr6EL7hExU2mag03ypt9GxN9KJHGJpK8APwEea0v32hJp3JFLnJHzcgj5UroISYeQpmO4G/g+8LGI+FuuP7sVKBTcJe1JKklsBJxEmoztLkmr5vz0+vLeBlwq6b9Z9rP4Won3MgnYPedjKqne+WTgH0gD2zYZ49gDgGOA7+b3sDnwbUm/Az4aEXcVeP1VgY8AUyJiuqSNgRdGRJnqpZ9LekMejFeapBdFxHxJW3V7vsR3q9L3KudlY+DfSGsrrNyWhyJXMp8mnRcAvwC6vp8Cvgn8FHi2pC+QrkY+XSaBHEiR9Gza3kcZNZwfAH9tnZdKI/H/F3h2iWxUOcf2yH+fDWwPXJwf7wxcytL/VV+GLrhL2ok0N/wfSCW9DSQdGBG/KpjEK/LfaW3bAtilRDb+GfgP0lTGC4ELgA+WOH5t4E2tL/CSTEQ8KemNJdJ5M/D1zvceEY9IeleB4/8n356Wb/24FbgE+EpEXNG2/cxckh/LPsDObZ/DHEnbkT7fK4EiAekHpOqUVk+VhcAZlGs7OBT4pKTHgVZBoXB1CunHZTrdG1TLfLeqfq8gfR6taoSdydUIBY/VKPdLiYiTJc0Bds3p/GNElP2R2pP0eT4XuAvYkBSQX1IimarnB6TS/qqkH9rPkf6XB455xLL6Psci4iAAST8HNm1d0Upal/JtGF1fYKhupBP5hW2PNwHmFDx2BeAtA8z7s8a6lUxrEnBRTfl6eoVjVxunz2pKwf1m57/XtW27flD/40HfWucCMK9t268LHjsf2BLYmhRItySV3rcCtiqYxgrAjTW8j+tJaz5clx/vDMwocXxt50dN/5dn9HuudH6edX3GQ1dyByZHxJJ53iPid5ImFzkwUsn4YFLjZd8kzQQOjYj78+NnAl+NiF6lgTmkkpyAKcB9+f6apF/3wvXHEfGEpEckrRER/dbpbUea1G01YIqkzYH3RcQHSiRzpKTPA48C55GqVg6LiB/3kZ81SKWttwIvZvlFX7p5XNIqLK3f3Yi2y98Sr70n0LrSuDTKVeu00pgMvL89HeC7UbDaUNLzgA+RqreWnHsRsWeJbFSpRvgT0Kou+HPbfSh4BZLPseslTYmI/ymR705/i4h7lPp4rxARl0gqvDxnHecHgKRpwKdIVw7t/5NCDbuSNiNVCT0rP74beEdE3FQiG5dKOp/UnhWkDgyXlDi+q2EM7rOV+kCflB+/jRQ0i7pQ0keB01i2t829JdJ4WSuw52Pvk7Rlr4MiN/5JOp7UI+Kc/Pj1wKtLvH7LX4F5kvrqOQR8A3gdeYrmiLi+QFVKp9dGxMcl/ROpKmEf0hevUHDPgXlPUkDfilTC+UegaDXbUaQflQ0knQzsALyzzBuQ9EXg5aS2AoBDJe0YEcst9t7DccBkUldASN3UjgPeU/D4n5F+bM+mfANkS2c1ws4UrEaIiJ37fM1O6wI3SbqaZb+XZX6k7pe0GqkX08mS7iJ1PCij6vkB6TvxMVLX3H7+JzOAj0TEJbCkWvl7LK1G7CkiDs7nV+vcnBERP+0jL8sYxq6QK5HqIXcklXp/BXwnIgqV1mrqOnc9sFPk3gRK/ap/GREvLXj8nIjYumPb7IiYNtoxo6TT9aSNiJndtnc5/qqIeEXFLms3RcRLJH0POCsiziuaRg7GryTVLZ9KajBaECV7
wEhai9Q9VsCVEVFqpRtJNwBbRBqI1Gokvq5o6awtneXed5nPs/X/KPOa403SjIiY3nvPZY7p2i0zCvTokXQYcDmpWugRUhXE20h9uk+Ocl2WK50fOY3LImLHovt3Ob7Sd6LtmA2BjSPiotwgPCkKducczVCV3PNJd0JEHMCyl4yFlQ0co/gqcEXu7geptPqFEsffLenTpNJtAAcApfvBlvmSjqJy7wzgbKWxA48CH8jd3v5a8NjNSFVTtwDz86V0odJEl54pre6TU3KVQJneT5CqxlpXb/0OkntC0kaRxypIej7lusj+h6SjSD92ffXkyqXUfTqqDE+NiNeVyEe7UgUOqNwtc31So/KLgBuAK0jB/uySV9dE6lm3Cqn9pteSnaM5StL3Sb2H2v8nRXuq3CbpMyytaTgA6FbAHJWk95Ia7J9F6vmzHnA8qcG6b8NYcj8f2CMiHu/z+Dq6ziHpJaRLXgG/iIibSxz7LFJ1witJwf1XwDFlv7wVu70haW3SifRq0vu4gNSWUOqHJgeQB3NwXhVYPSL+XPDYF5GqZPYl9Yp4EfDSXsdLGqvOMSKicO8nSfuThndfQvocXklaKvLUomnkdHYl9Va5LaezIXBQ65K8wPH/RqrK+T1LqwDKvpclV2FjbSuR3nkRsVvJYx4it4GQeohMBv4SxXsfkQsb00jVF9vl2/0RUXgktKQ9gH8HnhYRz5O0Bek8K1w9JOnHpO/kTSz7PynU2yafG59l2ZqGo6PEGAJJc4FtgKvarrDnFa0pGM1QldyzPwCXKw2pbq9HK1qSr6PrHKSeBfeRP6MyDUg5iB8qabWIeLjk67bru9tbvgp6e0S8rcLrt6wHvEZSe3/kHxU5MCLmA0eSGmankQL91ZIWRsSo9ZI11g8TEadIupRU7y7gX4r+OHWk84tWYSGnM79odWH2T8Dz+y24ZE+2fxfz5XzfJbSygT0fs8xgLkn/SApOZawCrE66iloDuJPiU1K0HJ1f99Kcr7m50bqMzasE0RzEy9Txd/NYRDyuPLhN0opU+J+2DGNwvzPfViA1vpW1UUTsm0trRMSjUrm5QJTmiziKNGfEE+Sh0UDRFvTtSYOXqvRSgQqj53Ipey/SD0PfcjXCTqSrh3NIIwsvo2Bw78jTbFKD+eEsbTwa7XUrj97T8oOPFua/z5X03KLVIWM0Qr9CElF8DMb1pOqhnoO3xvAp4DJJraqRV5Iu6XuS9I2IOEzS2XQJHiUbRNuP+5mkQo3TShNlvYQ0H81VpGqZr5Up6bZZHBEPdJzeZYPilZI2LXNlDrV/lr+U9ElgFUmvIU0ednaZ/HQzVME9lzZXi4iPVUimjq5zh5KqcvqdL+LrVO+lAtVHz10u6ViW7zlUpr56b1L3x+si4iCl6Ui/X+RASd/ssctYdbd7jPFcUGz0Xl2Dj7p9H4P0uaxP6nNdxHOA+ZKuYdn63cKBIDdob8XSBuYPl2hgbtUL/3vR1+um44d3BVL1StGgOgVYiTQ47n9JP7j3j3nE6G6U9FZgUr6iOoT0Y1HGjsCBSh0xHoPCc9zU8llmnwDeTbpyeR9wTkR8r2qiw1jn/ouI6LshQdJrSaWbTUl1zDtQol40p3EJ8JqIKNs1q3V85V4q+ZiXkxoj1yR1e1sd+HJEXFXw+G7vuWwd79URsY3SiMSdSSWuGyOi50hCpRGhN5LGHdxJR5VSDQ3GhUhaOdI6v2NuK5HejqTv2DOBL0REoVKWqvUyqWsKBJSmHH60o/fQShHxSMHjf9D2cDGpKvV7UWA6iXy8SKX37fNtM1Jj92+i4IRdOZ1VSf+H1+ZN55PmOypcmMvVWsuJjtHlYxx/aET8R69t451G13SHMLh/FdiYVE/eXtosPM+CqnedO4FUr9rXnCxKvWy+RpoNcVtSiWJaROxXMh/7RMQZvbaNJ6UpaT9JGlhxOPAwMDfy0Okex65F6mm0LykInEbqTtnzElzSARHxY0kf6fZ8iTYYlGcx7LWt
QDq7Ap8hlVL/NSIuLHN8FcpdFmv6wb4SeHWrPUipv/kFY7WBdBy/Q0Rc3mtbgXTWJxW+tgfeCKwVEWuWOL6280Mdc9wUbV8b5btVqoG7jjS6pjuEwf0HXTaXab1eruRf9mog1zN3y8RnCx5fVy+VvoLSaAGxpUxg7Eh3KqmnzA19HLseaWrWj5AaNE/qsf/7IuK7Vf4XktYhNQb/mNSQ27pyWB04PiJeVDDvu5NKiA8An+8jiF0WETtq2V4mQLkpg+siaW5EbNFr2xjH9/1jqTSp3vakoP43UjfI3+S/81pXE+Odj7b9u85x0+vKNLfpvZVUrfPrtqeeATwRET0HLY6Rxuqk9oR+Bj4uMVR17rB0Mp2ylHpyrAqsrdQ9qf1Efm7JPBQK4mMcfzdpYEZflEa0vgFYr6PeenWKjeJrNUS/kNRDpLWYwx4UHxnaysuSH8aI+EPntoJpbEUK7K8BzqXAiOMc2CeRumD22yj8OtJo1vVJJ3DrO/Eg6WqkqLNJdcP3AP/S0YDXs8488iCZqGHKYEn7AOdFxENKYym2IlVFXFcimb9I2qpVlaPUi+nRAq+9HSkwj3QUIFaneLvDVOBMUltB4amfO/JR9fxo9znS1fVFEbGlpJ1ZugjJWK4gjb1Ym2XbdB4i9d8voo40RjV0wT1fqn2L9MsepJ4Zh0bEwjEPTA0Rh5EC+RyWPZFLzbCmNFDn46R6wfZLtUKXvqo+h8idpNWK9mTZQPgQaSGSMbV+nCRdQJoQ6qH8+GhSdVdPdfxYSvos6XL7FtII1SPKtGNE6vGzJ332+Ml1+jMlvTlKLszRoXK3zNwwfkOUWPBlFJ+JiDNyvf/rSA16x7N0NtQiDgPOkHQn6Rx7LqnqrJenkXqArciyPdkeJDW89xQRY15VFlTp/OjQ1xw3uU7+j6T++X1ppSHp1eQ2EEmbkPrdl+0W2vUFhupGWoHpINIXaEVSyevCEsd/qIY8XEBqvb4FeBVp9aEvlTj+elI9+875+FcBr+ojH5Mrvo/5pIay1uOVSH2zixx7KGmk3WOkQTu359v1wMEF03iStD7kvHy7Id/mkQJdkTS+QGq7+AdKzmDYlsa/Amu2PX4mqXqlymdbKg/5mJMpOBvmGGm0ZlH8N+BkmTz3AAAS1klEQVSt7dsKHPtyYJ3Wd4u0Pu3F+fMtPGspsGGV91DXrf38yP/Tl/WRxkWkH6xvkSbu+g/gihLHb0tajOZh4HFS1+kHS+ZhDqkgtR5p8Y+fkqZiqPT5DGOde9X6wG6XrZ+Pcr0J5kTE1pJuiNwlStIvI6Jrb4cux9cyh4jS2qNHs3TGulYdbdERqp8C3kL6sgRpEM3pEfGvJfLwoSi2ok23Y7v2RGiJAj0SampA7Daqs3SDatXjJV1MCrB9T7ilNPf3/5Lac7YmVadcHcXm+rmW1JB6r1LX3FNJV5hbAC+OiEKl71y6/CjLX5mWWTOhMqWBaXvmPMwFFpHmgCp8ddDqOUSfc9xImk3qbHAGqUvoO4AXRMSnSuShtWzhh0hjW75cR4Pq0FXLkOZlOYD0Kwqp/qtMQ2S3y9bjKHfZ2prC9U+5Me1OUr1tUZXnEMlOoMt6sEVFxBcknUsq9ULqElqmbpaI+JbSoKypLHsi9xzEVCR4F0ijjpGqkyStFLmLnNI4iJUqpll4YJykF5D6uHe25byKFKjLeAtpycJ/j4j7lRZ2KDouZFIsnQJjX9Lsg2cBZykNgS/qDFJV0Pfp43tZozUiLbP3HuAHEXGU0iRxheQ2nf+K1HD5JGmRoNIiYoGkSRHxBPADpTV7y1Buz3gbqcYAaojNwxjc30W6TPw6qbR5Rd5WVOvLtjtwXET8V65rLuPzSnOPH066XFudcnV5LyXNIbILbfNVUG41KOhzPViluW1a/pBvS56LEnPcSDqJNJnRXJZ+tkGBEapdeocseYqCvUSUZgl9M8v/uBzT69g2PwZ+0dYT6yD6PJHblGl0/wbwyejoZSTpL6TR
xieUSGttUn0zkqbkbfMLHjtJ0oqR2j12ZdmRrWViweKIOK7E/uNlxfzj9hZSb6ZSop454R9RmidnrqQvkxpIn14yjcOAI4CfRsRNShPSVZ7PfeiqZaqqctlaYx7mk+r/qswhgtI85JMouR6s0mi71qIhsDTAlqrWyWndQloCbCBfFEnnkbogLnP1EhHdRp2Olc5uLO2aeh+wbkSUXeIOSS9j+R+aMcdgSLoxRmlIVckJoiTNY+n/dmXgeaR1hosMKvsUqZfJ3aSRoltFROQri5kRsUPBPBxN6jb4U5b9XpaaGK+qXAX7GeCyiPhADopfiYg3l0jjdFK9eV9zwueqx7tIbRgfJlXrfCd6Lxw/7oYmuEv6FmMMYS7xYa9KumydFxG35l/2l0bEBSXyUqm3i6TTSA27VeYQqaW+uSpJZwCHRJ/d1mp4/VEDY8l0tiD1KX4LqWH4rIg4tmQaJ5LmFyo1g6CkBRHxgrLPFczTVqR5i95XcP9tSYttXBARf8nbNiFN+1F0rp3KaybUoexV6ChpVJ4TvsJrj8tcPy3DVC0zu+3+ZykwOVY3kRbHvYs0MOBWUr/XW0smU3XFnMpziOT9+6pvVo1D1UnVADcrrbrT93up4ApJL42I0l3DctDaj6XtNqeRCjT91uNvGyWmpG1zjaT3Rsd8IZLeTblVxpYTEdcqTVNRdP8ru2z7XcnXrGPNhDpcldsKfgCc28/VZaQ54Ufy/UVlj1da8P5zLN/pocjAtDrnp1k+b8NScm9XpaU4N2ROI038tYmk5wJnFL3kzGlU6u2iCnOIdKTzHFI3vudGxOslbQpsFxFj1tGq3qHqtbyXsiTdSPphXZE0HcVtlJvYCUlPkkb+vbt1mSzptn5LmErTUnw1ys8g+BxSFcbjLA3m00j9xv8pSkw/rGUHD61A6g22VvS/WEdpqmnNhBryIVJV27tIU/+eBvywyI9VPvYoUndQkT7LxcC3yrTnSFoAvIlUUzBUwXRYg3vf3dTyL/mWwLWxdNKuG4oEg7Y03koKKFVWzHkOqdsbpDr/0lU0uafLD4BPRcTmSvM8X1emjvbvlaT7SF30uirSE0dpXcr9SKMqzyN1/ft+vyXP3H3wbNLi0qV+aPLxO5MmyQK4KSIu7iMP7Ve0rUm7zoo+J0HrR652nENaCHqz3PvoN1Gwu/I45WlnUsP500ljMT4REb8ZY/8Pk9ofpkfE7Xnb80k9686LgqOicwFq1ygxbULbsa32k67KxKyu6TcwuLdmMWz1HX066YtXJrhXWjFH0luAr5AWERCpK+LHIuLMsY7rks41EfFyLTu7ZOE+/3n/vroxth2/LanH0ItJJc1JlFx1px9VvgNd0no6aVHu/Uk9lmaSeiYUbofJ6SwglViXWUy5yA9NkyivB6yKs57WkI+1SMvavZ209sIJpKk2tiBdrY/6Iy7pOtLMr3d3bB8htUcUqjnIVWKfI01fXWqSQdUwDmQsQ1Pn3tFtblVJD7aeotzkSqdL+i6wptLahO8irUZeRtUVcz4FvLxVWs9fmItIc2qU8Zf8BW7NTb8tqedIIarQjbHNsSw/SGPjEsf369kaYwK0IidP275/IY0OPVmpm+g+pDm0SwV34H8iYlbv3eqntDLZqCawDQTqWTOhDr8h1Vv/Yyw7PclsScf3OHZyZ2CHVO8uaXKJPHyBNDp1ZVLhp7DxLhQMTXCPipMqaemq6t8gDft/kDRx1pFRfmrWqivmrNBRDXMPqU6vrI+QSiIbSbocGKHgHB7ZNGroxhjVB2n0YxJpWHipVbR6yb0rvptvZc2X9J+kqpl+FlOuYjvS0PRTSCsY1fq5lHQ0qZprA0knk+aBeucA8vHC0b7bEdFrfpixCm5lCnXPiojX9t5t4g1NcK/BaKuq99MboWpvl/OUFvpujbLdl7REXSm5J8SrWLpm528j4m89Dmt3I7AOaWBFv+oYpNGPP5Vp2Jogq5C+D+0nc9FVoapahzSrZmua2P8G
TomImybgtZcRERcoLd7SWjPh0G6l4PHSfhWjLitoFjxPN2+rHVgmedomCyzgIkmvLVvFNxGGss69CtWzqnpfPUSUh5lHxOVKS5G1VkS/jzRfxe+L5iGnN4k00nYqy9aZj1kl0dZv9hmk+se+uzHmesH/I11yTtggjSo9pppOadTu/qR2nWOiz7l/Krz+LFLBZVau8ppQkhYxxlXMePfk6sjLQ6TCzmOkaUsqzdGvNAPrBtHHmgnLpdXA4L4GKaDvkP+uSeqmVGqe+H56uyiNju02zHwacFREjLUuaLf0zgH+yvINeGMOfc9tDc9h2QUAIM9l0qsrZU5jShRcjWY8qIYBKnVRTQPsasjHSqQf+/1JP/izgBMjouz8NFXz8SrS1ejupILDacDPJ6rHTi70tK5iXsYAr2LqoBomQOuablOCu5ZfVf1K0hJ7pVdV77e3i2ocZp6PKdWFs+24yj8y7b1VJJ0VJYZ0N41GGcXYEhMzmnEmqRvlucCpEXHjeL9mLznI7gK8F9htvHtQjZKHgVzFqN41ba+LtFDIe0il9qP6PffbNanOvc5V1fvt7TJWXd0qfeTj3D7r86Z2u6yLiNlKS+UV0X6pO6HDyofNRATvAt5OmvtkE+CQtrrmQS3VtwppZa99SQOpJvQz6nIV800mpu2j5SOkide6zXFUdpLAShOgjZpoXQkNWkTslkedtVZVPxzYTFLpVdXpv7dL3cPMrwR+qrSKT5n6vDp+ZGKU+085Guc5QIqIiH56W42LPIjpFaQeM98GLu1nEE+F12+/ivnsIK5iImJ6/lvHlNTHAOcDl0fENXkwVdkpU5bTmGqZdqq+qvpXSHV57b1d5kXEx3scV9sw85zebaTBN6WGNks6Bbh4lB+Z10ZEzyXVJD1BKimK9IPwSOspBlBSHCRJW0fEnH4b2ptGaYbNC3PX2EG8/pMsncFxGBYcrzRQcLw0JrirxlXVc3rtvV1+FRE/LXFs5WHmOZ3zgdf3kfdaf2Se6gbduDws8jkxqgnq7z9URhsoWKaRXWmCu+NIPe02U5pWes+I+HylvDUouH+N3Lc9ap6eNjcc7RcRJ9eZboHX/SGpvvtcSg5tzsfX8iPzVOfG5URLFzt5Nqkg1fo+7Uyqmhkz+DeRaljvQNIvSatpfTeWTudQearrJtW5V15VXdLqwAdJC9XOIk3g/0HSBz+XNIR9IrUWpX4aJYc2A0TEJdSwoou5cRmg1Z0498batFWIyo2B3x5k3gaojoGCq0bE1R0DshZXyhUNCu41OYk04Og3wHtIQf1pwF4RUWaNyVr06s9uE8aNy8ua2nF1/H+kUdRPRd3WO4iI2KtEGnfn+Xlac/XsTbUfC6BB1TJ1aO+Lnqti7ibNWf3QgPJzCd17Z0zoKvNPdW5cXpakY0mTx51C+n7uB9w6UYO5hklHI7tI7XT7R4FlD9vSeD4wg1TVdR/pav1t0ZRZIYfEknlbIi2ee/ugAnv20bb7K5MWiq58uWblRMSkQedhmETEwUpz5b8yb/oNaUT0U05E/FLLL+HYa0bKJXI352kR8WqlqalXqCvmOLgvq30yIQGr5McDKaFFRGff+Mtz44vZoN1Omt5jyZq0g83OxFJNSzhGxJOSDgZOj5rn6XFwbzNsJTSlucdbVgC2JjXemE24ugJaQ8wnzd20RyxdwvHDfaZ1oaSPkj7PJQG+6txKrnMfYkqrzAfpymExqYR0TERcNtCM2VOSal6T9u+ZalzCMZ/nnaLq5+rgbmaF1BnQmkI1LeE4Hhzch5ikD5Lmgb8/P34mqSX+O4PNmT2VDXNAGyQtXcJx37I92sZjCgMH9yGmLothy4tY2BCpEtAsqWMKg67pOrgPL0k3AJu3hjbnvvc3lOlDa2bDrY4pDLoZmmlEravzgdMl7SppF9KgkfMGnCczq1drCoNaueQ+xPIAh/cBu5J6zFxAasAayFSrZla/PBK90lrHXdN1cB9uSgt+v5DUJfK3EfG3HoeY
2d+R8VonwMF9iEnaidQT4Q+kkvsGwIER8asBZsvMaiZpQ2DjiLhI0qrApKrTEDi4DzFJc4C3RsRv8+NNSKu8bz3YnJlZXSS9l7Qe67MiYiNJGwPHR8SuVdJ1g+pwm9wK7AAR8Ttg8gDzY2b1+yBpBbkHASLiVtKCKJV4bpnhNlvSCaR55gHeRn8LbZvZ8HosIh5vLdYhaUVqWDfAwX24vZ/0q34IeS1XwKNTzZrll5I+SZqF9jXAB4CzqybqOvchJ2kEICIWDTovZla/3OX53cBrSYW480ldnisFZwf3IaR0fXYUcDDpny3SsORvRcQxg8ybmf19cLXMcDqM1MDy8oi4HZYsxXWcpA9HxNcHmjszq42keSxfx/4AMBv4fETc01e6LrkPH0nXAa+JiLs7to8AF3jiMLPmkPRl0pX5f+ZN++W/DwI7RsQe/aTrkvtwmtwZ2CHVu0tyV0izZtkhInZoezxP0uURsYOkA/pN1P3ch9PjfT5nZn9/VpP0itYDSdsAq+WHi/tN1NUyQ0jSE7Stpdj+FLByRLj0btYQkl4OnMjSgP4QqffMzcDuEXF6X+k6uJuZDZ6kNUgx+f5a0nNwNzNrHte5m5k1kIO7mVkDuSukmdkA5e7N7wdemTf9kjTlb6WFeVznbmY2QJK+T5rKe2be9HbgiYh4T6V0HdzNzAZH0vURsXmvbWW5zt3MbLCekLRR60GeR+qJqom6zt3MbLA+Blwi6TbSQMUNgYOqJupqGTOzAZO0EvBCUnCfHxGPVU7Twd3MbLAkbQ9Mpa02JSJ+VCVNV8uYmQ2QpJOAjYC5LK1rD6BScHfJ3cxsgCTdAmxadVm9Tu4tY2Y2WDcC69SdqKtlzMwGa23gZklXA0saUiNizyqJOribmQ3W0eORqOvczcwGQJJ61bMX2Wc0rnM3MxuMSyR9SNKU9o2SniZpF0kzgQP7TdwldzOzAZC0MvAu4G3A84D7gZWBScAFwLcjYm7f6Tu4m5kNVp72d23gUS+zZ2Zmo3Kdu5lZAzm4m5k1kIO7mVkDObiblSRp0qDzYNaLg7s1nqTPSJov6UJJp0j6qKSNJJ0naY6kX0t6Ud73h5K+KekKSbdJ2jtv30nSJZL+E5iXtx0g6WpJcyV910HfhomDuzWapGnAm4EtgTcB0/JTM4APRcTWwEeB77Qdti6wI/BG4Itt27cBPhURm0p6MbAvsENEbEGaqvVt4/lezMrw3DLWdDsC/xURjwJIOps0UGR74AxJrf1WajvmZxHxJGkyp+e0bb86Im7P93cFtgauyWmsAtw1bu/CrCQHd2s6ddm2AnB/LnF3077EWfvxf+nYPjMijqiYP7Nx4WoZa7rLgD0krSxpNWB34BHgdkn7QJqcSdLmJdP9BbC3pGfnNJ4lacM6M25WhYO7NVpEXAPMAq4HfgLMBh4g1Y+/W9L1wE3AXiXTvRn4NHCBpBuAC0l19WZDwdMPWONJWi0iHpa0KvArYHpEXDvofJmNJ9e521PBDEmbkhpSZzqw21OBS+5mZg3kOnczswZycDczayAHdzOzBnJwNzNrIAd3M7MGcnA3M2ug/wc3bvn6CvS61AAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x6181ccd68>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Mean rating per genre, collected to pandas. `genre` is kept both as the index\n",
    "# (so the bars are labelled by genre) and as a regular column.\n",
    "avg_rating_by_genre = (\n",
    "    genre_rating\n",
    "    .groupBy(\"genre\")\n",
    "    .agg(F.avg(\"rating\"))\n",
    "    .toPandas()\n",
    "    .set_index(\"genre\", drop=False)\n",
    ")\n",
    "avg_rating_by_genre[\"avg(rating)\"].plot(kind=\"bar\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Enrich the genre-level ratings with per-genre statistics: the average rating, the rating standard deviation, and the log-scaled rating count, number of unique users, and number of high ratings (4 or 5)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>year</th>\n",
       "      <th>genre</th>\n",
       "      <th>userId</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>rating_avg</th>\n",
       "      <th>rating_count</th>\n",
       "      <th>rating_std</th>\n",
       "      <th>unique_user_count</th>\n",
       "      <th>high_rating</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1339</td>\n",
       "      <td>Dracula (Bram Stoker's Dracula) (1992)</td>\n",
       "      <td>[Fantasy, Horror, Romance, Thriller]</td>\n",
       "      <td>1992</td>\n",
       "      <td>Horror</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>2009-12-14 08:22:05</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2455</td>\n",
       "      <td>Fly, The (1986)</td>\n",
       "      <td>[Drama, Horror, Sci-Fi, Thriller]</td>\n",
       "      <td>1986</td>\n",
       "      <td>Horror</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2009-12-14 08:21:53</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>253</td>\n",
       "      <td>Interview with the Vampire: The Vampire Chroni...</td>\n",
       "      <td>[Drama, Horror]</td>\n",
       "      <td>1994</td>\n",
       "      <td>Horror</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1996-06-21 16:41:51</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>273</td>\n",
       "      <td>Mary Shelley's Frankenstein (Frankenstein) (1994)</td>\n",
       "      <td>[Drama, Horror, Sci-Fi]</td>\n",
       "      <td>1994</td>\n",
       "      <td>Horror</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1996-06-21 16:46:19</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>382</td>\n",
       "      <td>Wolf (1994)</td>\n",
       "      <td>[Drama, Horror, Romance, Thriller]</td>\n",
       "      <td>1994</td>\n",
       "      <td>Horror</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1996-06-21 16:52:45</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>593</td>\n",
       "      <td>Silence of the Lambs, The (1991)</td>\n",
       "      <td>[Crime, Horror, Thriller]</td>\n",
       "      <td>1991</td>\n",
       "      <td>Horror</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1996-06-21 16:41:51</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>593</td>\n",
       "      <td>Silence of the Lambs, The (1991)</td>\n",
       "      <td>[Crime, Horror, Thriller]</td>\n",
       "      <td>1991</td>\n",
       "      <td>Horror</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2011-03-01 01:07:20</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2513</td>\n",
       "      <td>Pet Sematary (1989)</td>\n",
       "      <td>[Horror]</td>\n",
       "      <td>1989</td>\n",
       "      <td>Horror</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2011-02-28 08:26:29</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2762</td>\n",
       "      <td>Sixth Sense, The (1999)</td>\n",
       "      <td>[Drama, Horror, Mystery]</td>\n",
       "      <td>1999</td>\n",
       "      <td>Horror</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2011-03-01 01:10:57</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2841</td>\n",
       "      <td>Stir of Echoes (1999)</td>\n",
       "      <td>[Horror, Mystery, Thriller]</td>\n",
       "      <td>1999</td>\n",
       "      <td>Horror</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>2011-02-28 08:25:33</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId                                              title  \\\n",
       "0     1339             Dracula (Bram Stoker's Dracula) (1992)   \n",
       "1     2455                                    Fly, The (1986)   \n",
       "2      253  Interview with the Vampire: The Vampire Chroni...   \n",
       "3      273  Mary Shelley's Frankenstein (Frankenstein) (1994)   \n",
       "4      382                                        Wolf (1994)   \n",
       "5      593                   Silence of the Lambs, The (1991)   \n",
       "6      593                   Silence of the Lambs, The (1991)   \n",
       "7     2513                                Pet Sematary (1989)   \n",
       "8     2762                            Sixth Sense, The (1999)   \n",
       "9     2841                              Stir of Echoes (1999)   \n",
       "\n",
       "                                 genres  year   genre  userId  rating  \\\n",
       "0  [Fantasy, Horror, Romance, Thriller]  1992  Horror       1       3   \n",
       "1     [Drama, Horror, Sci-Fi, Thriller]  1986  Horror       1       2   \n",
       "2                       [Drama, Horror]  1994  Horror       2       4   \n",
       "3               [Drama, Horror, Sci-Fi]  1994  Horror       2       4   \n",
       "4    [Drama, Horror, Romance, Thriller]  1994  Horror       2       3   \n",
       "5             [Crime, Horror, Thriller]  1991  Horror       2       3   \n",
       "6             [Crime, Horror, Thriller]  1991  Horror       3       3   \n",
       "7                              [Horror]  1989  Horror       3       3   \n",
       "8              [Drama, Horror, Mystery]  1999  Horror       3       3   \n",
       "9           [Horror, Mystery, Thriller]  1999  Horror       3       4   \n",
       "\n",
       "             timestamp  rating_avg  rating_count  rating_std  \\\n",
       "0  2009-12-14 08:22:05    3.181296      8.823206    1.151496   \n",
       "1  2009-12-14 08:21:53    3.181296      8.823206    1.151496   \n",
       "2  1996-06-21 16:41:51    3.181296      8.823206    1.151496   \n",
       "3  1996-06-21 16:46:19    3.181296      8.823206    1.151496   \n",
       "4  1996-06-21 16:52:45    3.181296      8.823206    1.151496   \n",
       "5  1996-06-21 16:41:51    3.181296      8.823206    1.151496   \n",
       "6  2011-03-01 01:07:20    3.181296      8.823206    1.151496   \n",
       "7  2011-02-28 08:26:29    3.181296      8.823206    1.151496   \n",
       "8  2011-03-01 01:10:57    3.181296      8.823206    1.151496   \n",
       "9  2011-02-28 08:25:33    3.181296      8.823206    1.151496   \n",
       "\n",
       "   unique_user_count  high_rating  \n",
       "0           6.388561     7.954723  \n",
       "1           6.388561     7.954723  \n",
       "2           6.388561     7.954723  \n",
       "3           6.388561     7.954723  \n",
       "4           6.388561     7.954723  \n",
       "5           6.388561     7.954723  \n",
       "6           6.388561     7.954723  \n",
       "7           6.388561     7.954723  \n",
       "8           6.388561     7.954723  \n",
       "9           6.388561     7.954723  "
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Each entry is (output column, expression). Entries are applied in order, so a later\n",
    "# entry may read or overwrite a column produced by an earlier one.\n",
    "genre_stat_columns = [\n",
    "    (\"rating_avg\", F.expr(\"avg(rating) over (partition by genre)\")),\n",
    "    # Number of ratings in the genre, then replaced by its natural log.\n",
    "    (\"rating_count\", F.expr(\"count(*) over (partition by genre)\")),\n",
    "    (\"rating_count\", F.log(\"rating_count\")),\n",
    "    (\"rating_std\", F.expr(\"stddev(rating) over (partition by genre)\")),\n",
    "    # Distinct users per genre: dense_rank numbers the distinct userIds 1..k within the\n",
    "    # genre, so the maximum rank equals the distinct count; it is then log-scaled.\n",
    "    (\"unique_user_count_rank\", F.expr(\"dense_rank() over (partition by genre order by userId)\")),\n",
    "    (\"unique_user_count\", F.expr(\"max(unique_user_count_rank) over (partition by genre)\")),\n",
    "    (\"unique_user_count\", F.log(\"unique_user_count\")),\n",
    "    # Number of ratings equal to 4 or 5 in the genre, then replaced by its log.\n",
    "    (\"high_rating\", F.expr(\"sum(if(rating = 4 or rating = 5, 1, 0)) over (partition by genre)\")),\n",
    "    (\"high_rating\", F.expr(\"log(high_rating)\")),\n",
    "]\n",
    "\n",
    "movie_genre_stats = genre_rating\n",
    "for stat_name, stat_expr in genre_stat_columns:\n",
    "    movie_genre_stats = movie_genre_stats.withColumn(stat_name, stat_expr)\n",
    "\n",
    "# The rank column was only a stepping stone for unique_user_count.\n",
    "movie_genre_stats = movie_genre_stats.drop(\"unique_user_count_rank\")\n",
    "movie_genre_stats.limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Since each movie has multiple genres, it is a good idea to calculate IDF scores weighted by individual measures such as the average rating. To calculate IDF scores, we need to pass the data through a few steps: StringIndexer => Group by movie ID => Vectorizer => IDF transformer.\n",
    "\n",
    "StringIndexer - replaces each string value of the genre with a corresponding numeric index.  \n",
    "Grouping - aggregates the genre-level stats for each userId by packing them into a set column. Look at the schema below for an example.  \n",
    "Vectorizer - creates a sparse vector, which is required by TF-IDF."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.feature import StringIndexer, IDF\n",
    "from pyspark.ml.pipeline import Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>year</th>\n",
       "      <th>genre</th>\n",
       "      <th>userId</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>rating_avg</th>\n",
       "      <th>rating_count</th>\n",
       "      <th>rating_std</th>\n",
       "      <th>unique_user_count</th>\n",
       "      <th>high_rating</th>\n",
       "      <th>genre_index</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1339</td>\n",
       "      <td>Dracula (Bram Stoker's Dracula) (1992)</td>\n",
       "      <td>[Fantasy, Horror, Romance, Thriller]</td>\n",
       "      <td>1992</td>\n",
       "      <td>Horror</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>2009-12-14 08:22:05</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "      <td>11.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2455</td>\n",
       "      <td>Fly, The (1986)</td>\n",
       "      <td>[Drama, Horror, Sci-Fi, Thriller]</td>\n",
       "      <td>1986</td>\n",
       "      <td>Horror</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2009-12-14 08:21:53</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "      <td>11.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>253</td>\n",
       "      <td>Interview with the Vampire: The Vampire Chroni...</td>\n",
       "      <td>[Drama, Horror]</td>\n",
       "      <td>1994</td>\n",
       "      <td>Horror</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1996-06-21 16:41:51</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "      <td>11.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>273</td>\n",
       "      <td>Mary Shelley's Frankenstein (Frankenstein) (1994)</td>\n",
       "      <td>[Drama, Horror, Sci-Fi]</td>\n",
       "      <td>1994</td>\n",
       "      <td>Horror</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1996-06-21 16:46:19</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "      <td>11.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>382</td>\n",
       "      <td>Wolf (1994)</td>\n",
       "      <td>[Drama, Horror, Romance, Thriller]</td>\n",
       "      <td>1994</td>\n",
       "      <td>Horror</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1996-06-21 16:52:45</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "      <td>11.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId                                              title  \\\n",
       "0     1339             Dracula (Bram Stoker's Dracula) (1992)   \n",
       "1     2455                                    Fly, The (1986)   \n",
       "2      253  Interview with the Vampire: The Vampire Chroni...   \n",
       "3      273  Mary Shelley's Frankenstein (Frankenstein) (1994)   \n",
       "4      382                                        Wolf (1994)   \n",
       "\n",
       "                                 genres  year   genre  userId  rating  \\\n",
       "0  [Fantasy, Horror, Romance, Thriller]  1992  Horror       1       3   \n",
       "1     [Drama, Horror, Sci-Fi, Thriller]  1986  Horror       1       2   \n",
       "2                       [Drama, Horror]  1994  Horror       2       4   \n",
       "3               [Drama, Horror, Sci-Fi]  1994  Horror       2       4   \n",
       "4    [Drama, Horror, Romance, Thriller]  1994  Horror       2       3   \n",
       "\n",
       "             timestamp  rating_avg  rating_count  rating_std  \\\n",
       "0  2009-12-14 08:22:05    3.181296      8.823206    1.151496   \n",
       "1  2009-12-14 08:21:53    3.181296      8.823206    1.151496   \n",
       "2  1996-06-21 16:41:51    3.181296      8.823206    1.151496   \n",
       "3  1996-06-21 16:46:19    3.181296      8.823206    1.151496   \n",
       "4  1996-06-21 16:52:45    3.181296      8.823206    1.151496   \n",
       "\n",
       "   unique_user_count  high_rating  genre_index  \n",
       "0           6.388561     7.954723         11.0  \n",
       "1           6.388561     7.954723         11.0  \n",
       "2           6.388561     7.954723         11.0  \n",
       "3           6.388561     7.954723         11.0  \n",
       "4           6.388561     7.954723         11.0  "
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Map each genre string to a numeric label index stored in `genre_index`.\n",
    "genre_indexer = StringIndexer(inputCol=\"genre\", outputCol=\"genre_index\")\n",
    "# Learn the genre -> index mapping from the data, then apply it to the same data.\n",
    "genre_indexer_model = genre_indexer.fit(movie_genre_stats)\n",
    "stats_by_genre_indexed = genre_indexer_model.transform(movie_genre_stats)\n",
    "stats_by_genre_indexed.limit(5).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movieId</th>\n",
       "      <th>title</th>\n",
       "      <th>genres</th>\n",
       "      <th>year</th>\n",
       "      <th>genre</th>\n",
       "      <th>userId</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>rating_avg</th>\n",
       "      <th>rating_count</th>\n",
       "      <th>rating_std</th>\n",
       "      <th>unique_user_count</th>\n",
       "      <th>high_rating</th>\n",
       "      <th>genre_index</th>\n",
       "      <th>rating_count_by_genre</th>\n",
       "      <th>rating_avg_by_genre</th>\n",
       "      <th>rating_std_by_genre</th>\n",
       "      <th>unique_user_count_by_genre</th>\n",
       "      <th>high_rating_by_genre</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1339</td>\n",
       "      <td>Dracula (Bram Stoker's Dracula) (1992)</td>\n",
       "      <td>[Fantasy, Horror, Romance, Thriller]</td>\n",
       "      <td>1992</td>\n",
       "      <td>Horror</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>2009-12-14 08:22:05</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "      <td>11.0</td>\n",
       "      <td>(11.0, 8.823206220552741)</td>\n",
       "      <td>(11.0, 3.1812960235640646)</td>\n",
       "      <td>(11.0, 1.1514958279439718)</td>\n",
       "      <td>(11.0, 6.38856140554563)</td>\n",
       "      <td>(11.0, 7.954723334497908)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2455</td>\n",
       "      <td>Fly, The (1986)</td>\n",
       "      <td>[Drama, Horror, Sci-Fi, Thriller]</td>\n",
       "      <td>1986</td>\n",
       "      <td>Horror</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2009-12-14 08:21:53</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "      <td>11.0</td>\n",
       "      <td>(11.0, 8.823206220552741)</td>\n",
       "      <td>(11.0, 3.1812960235640646)</td>\n",
       "      <td>(11.0, 1.1514958279439718)</td>\n",
       "      <td>(11.0, 6.38856140554563)</td>\n",
       "      <td>(11.0, 7.954723334497908)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>253</td>\n",
       "      <td>Interview with the Vampire: The Vampire Chroni...</td>\n",
       "      <td>[Drama, Horror]</td>\n",
       "      <td>1994</td>\n",
       "      <td>Horror</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1996-06-21 16:41:51</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "      <td>11.0</td>\n",
       "      <td>(11.0, 8.823206220552741)</td>\n",
       "      <td>(11.0, 3.1812960235640646)</td>\n",
       "      <td>(11.0, 1.1514958279439718)</td>\n",
       "      <td>(11.0, 6.38856140554563)</td>\n",
       "      <td>(11.0, 7.954723334497908)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>273</td>\n",
       "      <td>Mary Shelley's Frankenstein (Frankenstein) (1994)</td>\n",
       "      <td>[Drama, Horror, Sci-Fi]</td>\n",
       "      <td>1994</td>\n",
       "      <td>Horror</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1996-06-21 16:46:19</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "      <td>11.0</td>\n",
       "      <td>(11.0, 8.823206220552741)</td>\n",
       "      <td>(11.0, 3.1812960235640646)</td>\n",
       "      <td>(11.0, 1.1514958279439718)</td>\n",
       "      <td>(11.0, 6.38856140554563)</td>\n",
       "      <td>(11.0, 7.954723334497908)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>382</td>\n",
       "      <td>Wolf (1994)</td>\n",
       "      <td>[Drama, Horror, Romance, Thriller]</td>\n",
       "      <td>1994</td>\n",
       "      <td>Horror</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1996-06-21 16:52:45</td>\n",
       "      <td>3.181296</td>\n",
       "      <td>8.823206</td>\n",
       "      <td>1.151496</td>\n",
       "      <td>6.388561</td>\n",
       "      <td>7.954723</td>\n",
       "      <td>11.0</td>\n",
       "      <td>(11.0, 8.823206220552741)</td>\n",
       "      <td>(11.0, 3.1812960235640646)</td>\n",
       "      <td>(11.0, 1.1514958279439718)</td>\n",
       "      <td>(11.0, 6.38856140554563)</td>\n",
       "      <td>(11.0, 7.954723334497908)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   movieId                                              title  \\\n",
       "0     1339             Dracula (Bram Stoker's Dracula) (1992)   \n",
       "1     2455                                    Fly, The (1986)   \n",
       "2      253  Interview with the Vampire: The Vampire Chroni...   \n",
       "3      273  Mary Shelley's Frankenstein (Frankenstein) (1994)   \n",
       "4      382                                        Wolf (1994)   \n",
       "\n",
       "                                 genres  year   genre  userId  rating  \\\n",
       "0  [Fantasy, Horror, Romance, Thriller]  1992  Horror       1       3   \n",
       "1     [Drama, Horror, Sci-Fi, Thriller]  1986  Horror       1       2   \n",
       "2                       [Drama, Horror]  1994  Horror       2       4   \n",
       "3               [Drama, Horror, Sci-Fi]  1994  Horror       2       4   \n",
       "4    [Drama, Horror, Romance, Thriller]  1994  Horror       2       3   \n",
       "\n",
       "             timestamp  rating_avg  rating_count  rating_std  \\\n",
       "0  2009-12-14 08:22:05    3.181296      8.823206    1.151496   \n",
       "1  2009-12-14 08:21:53    3.181296      8.823206    1.151496   \n",
       "2  1996-06-21 16:41:51    3.181296      8.823206    1.151496   \n",
       "3  1996-06-21 16:46:19    3.181296      8.823206    1.151496   \n",
       "4  1996-06-21 16:52:45    3.181296      8.823206    1.151496   \n",
       "\n",
       "   unique_user_count  high_rating  genre_index      rating_count_by_genre  \\\n",
       "0           6.388561     7.954723         11.0  (11.0, 8.823206220552741)   \n",
       "1           6.388561     7.954723         11.0  (11.0, 8.823206220552741)   \n",
       "2           6.388561     7.954723         11.0  (11.0, 8.823206220552741)   \n",
       "3           6.388561     7.954723         11.0  (11.0, 8.823206220552741)   \n",
       "4           6.388561     7.954723         11.0  (11.0, 8.823206220552741)   \n",
       "\n",
       "          rating_avg_by_genre         rating_std_by_genre  \\\n",
       "0  (11.0, 3.1812960235640646)  (11.0, 1.1514958279439718)   \n",
       "1  (11.0, 3.1812960235640646)  (11.0, 1.1514958279439718)   \n",
       "2  (11.0, 3.1812960235640646)  (11.0, 1.1514958279439718)   \n",
       "3  (11.0, 3.1812960235640646)  (11.0, 1.1514958279439718)   \n",
       "4  (11.0, 3.1812960235640646)  (11.0, 1.1514958279439718)   \n",
       "\n",
       "  unique_user_count_by_genre       high_rating_by_genre  \n",
       "0   (11.0, 6.38856140554563)  (11.0, 7.954723334497908)  \n",
       "1   (11.0, 6.38856140554563)  (11.0, 7.954723334497908)  \n",
       "2   (11.0, 6.38856140554563)  (11.0, 7.954723334497908)  \n",
       "3   (11.0, 6.38856140554563)  (11.0, 7.954723334497908)  \n",
       "4   (11.0, 6.38856140554563)  (11.0, 7.954723334497908)  "
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pair every per-genre statistic with its genre index in a (genre_index, value)\n",
    "# struct; each entry below is (new struct column, source statistic column).\n",
    "struct_specs = [\n",
    "    (\"rating_count_by_genre\", \"rating_count\"),\n",
    "    (\"rating_avg_by_genre\", \"rating_avg\"),\n",
    "    (\"rating_std_by_genre\", \"rating_std\"),\n",
    "    (\"unique_user_count_by_genre\", \"unique_user_count\"),\n",
    "    (\"high_rating_by_genre\", \"high_rating\"),\n",
    "]\n",
    "\n",
    "stats_by_genre_structed = stats_by_genre_indexed\n",
    "for struct_column, stat_column in struct_specs:\n",
    "    stats_by_genre_structed = stats_by_genre_structed.withColumn(\n",
    "        struct_column,\n",
    "        F.struct(\"genre_index\", F.col(stat_column).alias(\"value\")),\n",
    "    )\n",
    "\n",
    "stats_by_genre_structed.limit(5).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>rating_count_by_genre</th>\n",
       "      <th>rating_avg_by_genre</th>\n",
       "      <th>rating_std_by_genre</th>\n",
       "      <th>unique_user_count_by_genre</th>\n",
       "      <th>high_rating_by_genre</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12</td>\n",
       "      <td>[(10.0, 8.939187601475613), (2.0, 10.205664071...</td>\n",
       "      <td>[(12.0, 3.4912479740680715), (16.0, 3.44926778...</td>\n",
       "      <td>[(3.0, 1.0866794748049233), (0.0, 1.0486263341...</td>\n",
       "      <td>[(17.0, 5.552959584921617), (1.0, 6.5072777123...</td>\n",
       "      <td>[(12.0, 8.117312461601975), (5.0, 9.2159247502...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>18</td>\n",
       "      <td>[(10.0, 8.939187601475613), (2.0, 10.205664071...</td>\n",
       "      <td>[(11.0, 3.1812960235640646), (3.0, 3.384627575...</td>\n",
       "      <td>[(3.0, 1.0866794748049233), (2.0, 1.1159512760...</td>\n",
       "      <td>[(2.0, 6.508769136971682), (3.0, 6.50578406012...</td>\n",
       "      <td>[(10.0, 8.38548870041881), (5.0, 9.21592475027...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38</td>\n",
       "      <td>[(10.0, 8.939187601475613), (2.0, 10.205664071...</td>\n",
       "      <td>[(13.0, 3.697313432835821), (16.0, 3.449267782...</td>\n",
       "      <td>[(3.0, 1.0866794748049233), (0.0, 1.0486263341...</td>\n",
       "      <td>[(17.0, 5.552959584921617), (1.0, 6.5072777123...</td>\n",
       "      <td>[(12.0, 8.117312461601975), (5.0, 9.2159247502...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>67</td>\n",
       "      <td>[(10.0, 8.939187601475613), (2.0, 10.205664071...</td>\n",
       "      <td>[(13.0, 3.697313432835821), (16.0, 3.449267782...</td>\n",
       "      <td>[(3.0, 1.0866794748049233), (0.0, 1.0486263341...</td>\n",
       "      <td>[(1.0, 6.507277712385012), (4.0, 6.50727771238...</td>\n",
       "      <td>[(12.0, 8.117312461601975), (5.0, 9.2159247502...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>70</td>\n",
       "      <td>[(10.0, 8.939187601475613), (2.0, 10.205664071...</td>\n",
       "      <td>[(12.0, 3.4912479740680715), (2.0, 3.304590479...</td>\n",
       "      <td>[(3.0, 1.0866794748049233), (0.0, 1.0486263341...</td>\n",
       "      <td>[(1.0, 6.507277712385012), (4.0, 6.50727771238...</td>\n",
       "      <td>[(12.0, 8.117312461601975), (5.0, 9.2159247502...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId                              rating_count_by_genre  \\\n",
       "0      12  [(10.0, 8.939187601475613), (2.0, 10.205664071...   \n",
       "1      18  [(10.0, 8.939187601475613), (2.0, 10.205664071...   \n",
       "2      38  [(10.0, 8.939187601475613), (2.0, 10.205664071...   \n",
       "3      67  [(10.0, 8.939187601475613), (2.0, 10.205664071...   \n",
       "4      70  [(10.0, 8.939187601475613), (2.0, 10.205664071...   \n",
       "\n",
       "                                 rating_avg_by_genre  \\\n",
       "0  [(12.0, 3.4912479740680715), (16.0, 3.44926778...   \n",
       "1  [(11.0, 3.1812960235640646), (3.0, 3.384627575...   \n",
       "2  [(13.0, 3.697313432835821), (16.0, 3.449267782...   \n",
       "3  [(13.0, 3.697313432835821), (16.0, 3.449267782...   \n",
       "4  [(12.0, 3.4912479740680715), (2.0, 3.304590479...   \n",
       "\n",
       "                                 rating_std_by_genre  \\\n",
       "0  [(3.0, 1.0866794748049233), (0.0, 1.0486263341...   \n",
       "1  [(3.0, 1.0866794748049233), (2.0, 1.1159512760...   \n",
       "2  [(3.0, 1.0866794748049233), (0.0, 1.0486263341...   \n",
       "3  [(3.0, 1.0866794748049233), (0.0, 1.0486263341...   \n",
       "4  [(3.0, 1.0866794748049233), (0.0, 1.0486263341...   \n",
       "\n",
       "                          unique_user_count_by_genre  \\\n",
       "0  [(17.0, 5.552959584921617), (1.0, 6.5072777123...   \n",
       "1  [(2.0, 6.508769136971682), (3.0, 6.50578406012...   \n",
       "2  [(17.0, 5.552959584921617), (1.0, 6.5072777123...   \n",
       "3  [(1.0, 6.507277712385012), (4.0, 6.50727771238...   \n",
       "4  [(1.0, 6.507277712385012), (4.0, 6.50727771238...   \n",
       "\n",
       "                                high_rating_by_genre  \n",
       "0  [(12.0, 8.117312461601975), (5.0, 9.2159247502...  \n",
       "1  [(10.0, 8.38548870041881), (5.0, 9.21592475027...  \n",
       "2  [(12.0, 8.117312461601975), (5.0, 9.2159247502...  \n",
       "3  [(12.0, 8.117312461601975), (5.0, 9.2159247502...  \n",
       "4  [(12.0, 8.117312461601975), (5.0, 9.2159247502...  "
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# For every user, gather the distinct (genre_index, value) structs of each statistic\n",
    "# into one array column; each array keeps the name of the struct column it came from.\n",
    "struct_columns = [\n",
    "    \"rating_count_by_genre\",\n",
    "    \"rating_avg_by_genre\",\n",
    "    \"rating_std_by_genre\",\n",
    "    \"unique_user_count_by_genre\",\n",
    "    \"high_rating_by_genre\",\n",
    "]\n",
    "\n",
    "stats_by_genre_grouped_by_user = stats_by_genre_structed.groupBy(\"userId\").agg(\n",
    "    *[F.collect_set(column).alias(column) for column in struct_columns]\n",
    ")\n",
    "\n",
    "stats_by_genre_grouped_by_user.limit(5).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- userId: integer (nullable = true)\n",
      " |-- rating_count_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_avg_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_std_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- unique_user_count_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- high_rating_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Inspect the per-user schema: one array of (genre_index, value) structs per statistic.\n",
    "stats_by_genre_grouped_by_user.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- userId: integer (nullable = true)\n",
      " |-- rating_count_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_avg_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_std_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- unique_user_count_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- high_rating_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_count_by_genre_vec: vector (nullable = true)\n",
      " |-- rating_avg_by_genre_vec: vector (nullable = true)\n",
      " |-- rating_std_by_genre_vec: vector (nullable = true)\n",
      " |-- unique_user_count_by_genre_vec: vector (nullable = true)\n",
      " |-- high_rating_by_genre_vec: vector (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from pyspark.ml.linalg import SparseVector, VectorUDT\n",
    "\n",
    "def to_vector(array_of_structs, size=20):\n",
    "    \"\"\"Convert an array of (genre_index, value) structs into a SparseVector.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    array_of_structs : sequence or None\n",
    "        Elements expose ``genre_index`` and ``value`` attributes (Spark rows of the\n",
    "        ``*_by_genre`` array columns). ``None`` or an empty sequence yields an\n",
    "        all-zero vector.\n",
    "    size : int\n",
    "        Dimension of the resulting vector; defaults to 20.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    SparseVector\n",
    "        Vector of length ``size`` holding ``value`` at position ``genre_index``.\n",
    "\n",
    "    Notes\n",
    "    -----\n",
    "    Null elements and elements whose ``value`` is null are skipped, so those\n",
    "    positions stay at the implicit zero instead of passing nulls into the vector.\n",
    "    Two entries with the same ``genre_index`` are still rejected by SparseVector.\n",
    "    \"\"\"\n",
    "    if not array_of_structs:\n",
    "        return SparseVector(size, [], [])\n",
    "    # genre_index arrives as a double; cast explicitly because sparse indices are ints.\n",
    "    entries = sorted(\n",
    "        (int(item.genre_index), item.value)\n",
    "        for item in array_of_structs\n",
    "        if item is not None and item.value is not None\n",
    "    )\n",
    "    indices = [index for index, _ in entries]\n",
    "    values = [value for _, value in entries]\n",
    "    return SparseVector(size, indices, values)\n",
    "\n",
    "# Register the converter with Spark SQL so it can be called from F.expr below.\n",
    "spark.udf.register(\"to_vector\", to_vector, VectorUDT())\n",
    "\n",
    "# (new vector column, SQL expression producing it) for each collected statistic.\n",
    "vector_specs = [\n",
    "    (\"rating_count_by_genre_vec\", \"to_vector(rating_count_by_genre)\"),\n",
    "    (\"rating_avg_by_genre_vec\", \"to_vector(rating_avg_by_genre)\"),\n",
    "    (\"rating_std_by_genre_vec\", \"to_vector(rating_std_by_genre)\"),\n",
    "    (\"unique_user_count_by_genre_vec\", \"to_vector(unique_user_count_by_genre)\"),\n",
    "    (\"high_rating_by_genre_vec\", \"to_vector(high_rating_by_genre)\"),\n",
    "]\n",
    "\n",
    "stats_by_genre_grouped_by_user_vec = stats_by_genre_grouped_by_user\n",
    "for vector_column, sql_expression in vector_specs:\n",
    "    stats_by_genre_grouped_by_user_vec = stats_by_genre_grouped_by_user_vec.withColumn(\n",
    "        vector_column, F.expr(sql_expression)\n",
    "    )\n",
    "\n",
    "stats_by_genre_grouped_by_user_vec.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.feature import IDF\n",
    "from pyspark.ml.pipeline import Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# (input vector column, IDF output column) pairs. The unique-user output name omits\n",
    "# \"count\"; it is kept as-is because later cells refer to it by that exact name.\n",
    "idf_columns = [\n",
    "    (\"rating_count_by_genre_vec\", \"rating_count_by_genre_idf\"),\n",
    "    (\"rating_avg_by_genre_vec\", \"rating_avg_by_genre_idf\"),\n",
    "    (\"rating_std_by_genre_vec\", \"rating_std_by_genre_idf\"),\n",
    "    (\"unique_user_count_by_genre_vec\", \"unique_user_by_genre_idf\"),\n",
    "    (\"high_rating_by_genre_vec\", \"high_rating_by_genre_idf\"),\n",
    "]\n",
    "\n",
    "pipe = Pipeline(stages=[IDF(inputCol=source, outputCol=target) for source, target in idf_columns])\n",
    "\n",
    "# Fit the IDF stages on the per-user vectors, then apply them to the same frame.\n",
    "idf_model = pipe.fit(stats_by_genre_grouped_by_user_vec)\n",
    "user_profile_idf = idf_model.transform(stats_by_genre_grouped_by_user_vec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- userId: integer (nullable = true)\n",
      " |-- rating_count_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_avg_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_std_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- unique_user_count_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- high_rating_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_count_by_genre_vec: vector (nullable = true)\n",
      " |-- rating_avg_by_genre_vec: vector (nullable = true)\n",
      " |-- rating_std_by_genre_vec: vector (nullable = true)\n",
      " |-- unique_user_count_by_genre_vec: vector (nullable = true)\n",
      " |-- high_rating_by_genre_vec: vector (nullable = true)\n",
      " |-- rating_count_by_genre_idf: vector (nullable = true)\n",
      " |-- rating_avg_by_genre_idf: vector (nullable = true)\n",
      " |-- rating_std_by_genre_idf: vector (nullable = true)\n",
      " |-- unique_user_by_genre_idf: vector (nullable = true)\n",
      " |-- high_rating_by_genre_idf: vector (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Confirm that the five *_idf vector columns were appended to the user profile.\n",
    "user_profile_idf.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- userId: integer (nullable = true)\n",
      " |-- movieId: integer (nullable = true)\n",
      " |-- rating: integer (nullable = true)\n",
      " |-- timestamp: integer (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- genres: string (nullable = true)\n",
      " |-- rating_count_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_avg_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_std_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- unique_user_count_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- high_rating_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_count_by_genre_vec: vector (nullable = true)\n",
      " |-- rating_avg_by_genre_vec: vector (nullable = true)\n",
      " |-- rating_std_by_genre_vec: vector (nullable = true)\n",
      " |-- unique_user_count_by_genre_vec: vector (nullable = true)\n",
      " |-- high_rating_by_genre_vec: vector (nullable = true)\n",
      " |-- rating_count_by_genre_idf: vector (nullable = true)\n",
      " |-- rating_avg_by_genre_idf: vector (nullable = true)\n",
      " |-- rating_std_by_genre_idf: vector (nullable = true)\n",
      " |-- unique_user_by_genre_idf: vector (nullable = true)\n",
      " |-- high_rating_by_genre_idf: vector (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Attach movie metadata and the per-user IDF profile to every rating, then replace\n",
    "# the rating with twice its value cast to int.\n",
    "enriched = (\n",
    "    ratings\n",
    "    .join(movies, on=\"movieId\")\n",
    "    .join(user_profile_idf, on=\"userId\")\n",
    "    .withColumn(\"rating\", F.expr(\"cast(rating * 2 as int)\"))\n",
    "    # Disabled feature ideas (release year and rating age), kept for reference:\n",
    "    # .withColumn(\"year\", F.regexp_extract(\"title\", r\"\\d+\", 1))\n",
    "    # .withColumn(\"age\", F.expr(\"year(from_unixtime(timestamp)) - year\"))\n",
    ")\n",
    "\n",
    "enriched.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.feature import VectorAssembler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Concatenate the five IDF-weighted genre vectors into a single \"features\" column.\n",
    "idf_feature_columns = [\n",
    "    \"rating_count_by_genre_idf\",\n",
    "    \"rating_avg_by_genre_idf\",\n",
    "    \"rating_std_by_genre_idf\",\n",
    "    \"unique_user_by_genre_idf\",\n",
    "    \"high_rating_by_genre_idf\",\n",
    "]\n",
    "\n",
    "vector_assembler = VectorAssembler(inputCols=idf_feature_columns, outputCol=\"features\")\n",
    "enriched_vec = vector_assembler.transform(enriched)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 70/30 train/test split with a fixed seed so the split is repeatable.\n",
    "df_train, df_test = enriched_vec.randomSplit(weights=[0.7, 0.3], seed=1)\n",
    "\n",
    "# Register the training split as temp view \"df_train\" and cache it.\n",
    "cache_df(df_train, \"df_train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- userId: integer (nullable = true)\n",
      " |-- movieId: integer (nullable = true)\n",
      " |-- rating: integer (nullable = true)\n",
      " |-- timestamp: integer (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- genres: string (nullable = true)\n",
      " |-- rating_count_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_avg_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_std_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- unique_user_count_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- high_rating_by_genre: array (nullable = true)\n",
      " |    |-- element: struct (containsNull = true)\n",
      " |    |    |-- genre_index: double (nullable = false)\n",
      " |    |    |-- value: double (nullable = true)\n",
      " |-- rating_count_by_genre_vec: vector (nullable = true)\n",
      " |-- rating_avg_by_genre_vec: vector (nullable = true)\n",
      " |-- rating_std_by_genre_vec: vector (nullable = true)\n",
      " |-- unique_user_count_by_genre_vec: vector (nullable = true)\n",
      " |-- high_rating_by_genre_vec: vector (nullable = true)\n",
      " |-- rating_count_by_genre_idf: vector (nullable = true)\n",
      " |-- rating_avg_by_genre_idf: vector (nullable = true)\n",
      " |-- rating_std_by_genre_idf: vector (nullable = true)\n",
      " |-- unique_user_by_genre_idf: vector (nullable = true)\n",
      " |-- high_rating_by_genre_idf: vector (nullable = true)\n",
      " |-- features: vector (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Check that the training split carries the assembled \"features\" vector column.\n",
    "df_train.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.classification import LogisticRegression\n",
    "from pyspark.ml.evaluation import MulticlassClassificationEvaluator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test accuracy = 0.361758 \n"
     ]
    }
   ],
   "source": [
    "# Multinomial logistic regression predicting the integer rating label from the\n",
    "# assembled genre-profile features.\n",
    "# NOTE(review): with regParam=0.0 the elasticNetParam mix presumably has no effect - confirm.\n",
    "lr = LogisticRegression(\n",
    "    featuresCol=\"features\",\n",
    "    labelCol=\"rating\",\n",
    "    family=\"multinomial\",\n",
    "    maxIter=10,\n",
    "    regParam=0.0,\n",
    "    elasticNetParam=0.8,\n",
    ")\n",
    "\n",
    "lr_model = lr.fit(df_train)\n",
    "\n",
    "# Score the held-out split and report plain classification accuracy.\n",
    "predictions = lr_model.transform(df_test)\n",
    "\n",
    "evaluator = MulticlassClassificationEvaluator(\n",
    "    labelCol=\"rating\", predictionCol=\"prediction\", metricName=\"accuracy\"\n",
    ")\n",
    "accuracy = evaluator.evaluate(predictions)\n",
    "print(\"Test accuracy = %g \" % accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(rating=10),\n",
       " Row(rating=2),\n",
       " Row(rating=6),\n",
       " Row(rating=4),\n",
       " Row(rating=8),\n",
       " Row(rating=0)]"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the distinct label values present in the training split.\n",
    "(\n",
    "    df_train\n",
    "    .select(\"rating\")\n",
    "    .distinct()\n",
    "    .collect()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
